path: root/kernel
author     Mike Travis <travis@sgi.com>    2008-12-31 20:34:16 -0500
committer  Ingo Molnar <mingo@elte.hu>     2009-01-03 12:53:31 -0500
commit     7eb19553369c46cc1fa64caf120cbcab1b597f7c (patch)
tree       ef1a3beae706b9497c845d0a2557ceb4d2754998 /kernel
parent     6092848a2a23b660150a38bc06f59d75838d70c8 (diff)
parent     8c384cdee3e04d6194a2c2b192b624754f990835 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-cpumask into merge-rr-cpumask
Conflicts:
	arch/x86/kernel/io_apic.c
	kernel/rcuclassic.c
	kernel/sched.c
	kernel/time/tick-sched.c

Signed-off-by: Mike Travis <travis@sgi.com>
[ mingo@elte.hu: backmerged typo fix for io_apic.c ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 25
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/acct.c | 7
-rw-r--r--  kernel/auditsc.c | 255
-rw-r--r--  kernel/capability.c | 288
-rw-r--r--  kernel/cgroup.c | 25
-rw-r--r--  kernel/compat.c | 49
-rw-r--r--  kernel/cpu.c | 144
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 588
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/exit.c | 25
-rw-r--r--  kernel/fork.c | 68
-rw-r--r--  kernel/futex.c | 81
-rw-r--r--  kernel/futex_compat.c | 7
-rw-r--r--  kernel/hrtimer.c | 331
-rw-r--r--  kernel/irq/chip.c | 1
-rw-r--r--  kernel/irq/handle.c | 5
-rw-r--r--  kernel/irq/manage.c | 38
-rw-r--r--  kernel/irq/numa_migrate.c | 11
-rw-r--r--  kernel/irq/proc.c | 32
-rw-r--r--  kernel/kallsyms.c | 16
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 30
-rw-r--r--  kernel/lockdep.c | 27
-rw-r--r--  kernel/nsproxy.c | 15
-rw-r--r--  kernel/panic.c | 32
-rw-r--r--  kernel/posix-timers.c | 46
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/profile.c | 38
-rw-r--r--  kernel/ptrace.c | 41
-rw-r--r--  kernel/rcuclassic.c | 29
-rw-r--r--  kernel/rcupreempt.c | 29
-rw-r--r--  kernel/rcupreempt_trace.c | 10
-rw-r--r--  kernel/rcutorture.c | 93
-rw-r--r--  kernel/rcutree.c | 1535
-rw-r--r--  kernel/rcutree_trace.c | 271
-rw-r--r--  kernel/resource.c | 9
-rw-r--r--  kernel/sched.c | 115
-rw-r--r--  kernel/sched_fair.c | 9
-rw-r--r--  kernel/sched_rt.c | 2
-rw-r--r--  kernel/sched_stats.h | 5
-rw-r--r--  kernel/signal.c | 60
-rw-r--r--  kernel/smp.c | 145
-rw-r--r--  kernel/softirq.c | 21
-rw-r--r--  kernel/softlockup.c | 10
-rw-r--r--  kernel/stacktrace.c | 11
-rw-r--r--  kernel/stop_machine.c | 8
-rw-r--r--  kernel/sys.c | 586
-rw-r--r--  kernel/sysctl.c | 26
-rw-r--r--  kernel/sysctl_check.c | 1
-rw-r--r--  kernel/taskstats.c | 39
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/ntp.c | 4
-rw-r--r--  kernel/time/tick-broadcast.c | 115
-rw-r--r--  kernel/time/tick-common.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 44
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/Kconfig | 17
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/ftrace.c | 15
-rw-r--r--  kernel/trace/ring_buffer.c | 103
-rw-r--r--  kernel/trace/trace.c | 156
-rw-r--r--  kernel/trace/trace.h | 20
-rw-r--r--  kernel/trace/trace_boot.c | 14
-rw-r--r--  kernel/trace/trace_bts.c | 276
-rw-r--r--  kernel/trace/trace_functions.c | 14
-rw-r--r--  kernel/trace/trace_functions_graph.c | 68
-rw-r--r--  kernel/trace/trace_hw_branches.c | 195
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 6
-rw-r--r--  kernel/trace/trace_power.c | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 17
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 49
-rw-r--r--  kernel/trace/trace_sysprof.c | 26
-rw-r--r--  kernel/tsacct.c | 6
-rw-r--r--  kernel/uid16.c | 31
-rw-r--r--  kernel/user.c | 96
-rw-r--r--  kernel/user_namespace.c | 65
-rw-r--r--  kernel/workqueue.c | 34
81 files changed, 4636 insertions(+), 2040 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
 
 endchoice
 
-config PREEMPT_RCU
-	bool "Preemptible RCU"
-	depends on PREEMPT
-	default n
-	help
-	  This option reduces the latency of the kernel by making certain
-	  RCU sections preemptible. Normally RCU code is non-preemptible, if
-	  this option is selected then read-only RCU sections become
-	  preemptible. This helps latency, but may expose bugs due to
-	  now-naive assumptions about each RCU read-side critical section
-	  remaining on a given CPU through its execution.
-
-	  Say N if you are unsure.
-
-config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
-	help
-	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.
-
-	  Say Y here if you want to enable RCU tracing
-	  Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 6a212b842d86..e1c5bf3365c0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -73,10 +73,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/acct.c b/kernel/acct.c
index f6006a60df5d..d57b7cbb98b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = current->uid;
-	ac.ac_gid = current->gid;
+	current_uid_gid(&ac.ac_uid, &ac.ac_gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
 #if ACCT_VERSION==1 || ACCT_VERSION==2
 	/* backward-compatible 16 bit fields */
-	ac.ac_uid16 = current->uid;
-	ac.ac_gid16 = current->gid;
+	ac.ac_uid16 = ac.ac_uid;
+	ac.ac_gid16 = ac.ac_gid;
 #endif
 #if ACCT_VERSION==3
 	ac.ac_pid = task_tgid_nr_ns(current, ns);
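
The acct.c hunk above swaps direct reads of current->uid/current->gid for the current_uid_gid() accessor from the new credentials API. A minimal sketch of that accessor style, not part of the patch (the helper names are the ones visible in this series, e.g. current_fsuid() as used in the cgroup.c hunk further down; fill_ids() is a made-up example):

    #include <linux/types.h>
    #include <linux/cred.h>

    /* Sketch: read the calling task's IDs through its cred struct
     * instead of the old task_struct uid/gid fields. */
    static void fill_ids(uid_t *uid, gid_t *gid, uid_t *fsuid)
    {
            current_uid_gid(uid, gid);      /* real uid and gid in one call */
            *fsuid = current_fsuid();       /* filesystem uid, e.g. for new inodes */
    }
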
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2a3f0afc4d2a..4819f3711973 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
65#include <linux/highmem.h> 65#include <linux/highmem.h>
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h>
68 69
69#include "audit.h" 70#include "audit.h"
70 71
@@ -84,6 +85,15 @@ int audit_n_rules;
84/* determines whether we collect data for signals sent */ 85/* determines whether we collect data for signals sent */
85int audit_signals; 86int audit_signals;
86 87
88struct audit_cap_data {
89 kernel_cap_t permitted;
90 kernel_cap_t inheritable;
91 union {
92 unsigned int fE; /* effective bit of a file capability */
93 kernel_cap_t effective; /* effective set of a process */
94 };
95};
96
87/* When fs/namei.c:getname() is called, we store the pointer in name and 97/* When fs/namei.c:getname() is called, we store the pointer in name and
88 * we don't let putname() free it (instead we free all of the saved 98 * we don't let putname() free it (instead we free all of the saved
89 * pointers at syscall exit time). 99 * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
100 gid_t gid; 110 gid_t gid;
101 dev_t rdev; 111 dev_t rdev;
102 u32 osid; 112 u32 osid;
113 struct audit_cap_data fcap;
114 unsigned int fcap_ver;
103}; 115};
104 116
105struct audit_aux_data { 117struct audit_aux_data {
@@ -184,6 +196,20 @@ struct audit_aux_data_pids {
184 int pid_count; 196 int pid_count;
185}; 197};
186 198
199struct audit_aux_data_bprm_fcaps {
200 struct audit_aux_data d;
201 struct audit_cap_data fcap;
202 unsigned int fcap_ver;
203 struct audit_cap_data old_pcap;
204 struct audit_cap_data new_pcap;
205};
206
207struct audit_aux_data_capset {
208 struct audit_aux_data d;
209 pid_t pid;
210 struct audit_cap_data cap;
211};
212
187struct audit_tree_refs { 213struct audit_tree_refs {
188 struct audit_tree_refs *next; 214 struct audit_tree_refs *next;
189 struct audit_chunk *c[31]; 215 struct audit_chunk *c[31];
@@ -421,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
421 struct audit_names *name, 447 struct audit_names *name,
422 enum audit_state *state) 448 enum audit_state *state)
423{ 449{
450 const struct cred *cred = get_task_cred(tsk);
424 int i, j, need_sid = 1; 451 int i, j, need_sid = 1;
425 u32 sid; 452 u32 sid;
426 453
@@ -440,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk,
 		}
 		break;
 	case AUDIT_UID:
-		result = audit_comparator(tsk->uid, f->op, f->val);
+		result = audit_comparator(cred->uid, f->op, f->val);
 		break;
 	case AUDIT_EUID:
-		result = audit_comparator(tsk->euid, f->op, f->val);
+		result = audit_comparator(cred->euid, f->op, f->val);
 		break;
 	case AUDIT_SUID:
-		result = audit_comparator(tsk->suid, f->op, f->val);
+		result = audit_comparator(cred->suid, f->op, f->val);
 		break;
 	case AUDIT_FSUID:
-		result = audit_comparator(tsk->fsuid, f->op, f->val);
+		result = audit_comparator(cred->fsuid, f->op, f->val);
 		break;
 	case AUDIT_GID:
-		result = audit_comparator(tsk->gid, f->op, f->val);
+		result = audit_comparator(cred->gid, f->op, f->val);
 		break;
 	case AUDIT_EGID:
-		result = audit_comparator(tsk->egid, f->op, f->val);
+		result = audit_comparator(cred->egid, f->op, f->val);
 		break;
 	case AUDIT_SGID:
-		result = audit_comparator(tsk->sgid, f->op, f->val);
+		result = audit_comparator(cred->sgid, f->op, f->val);
 		break;
 	case AUDIT_FSGID:
-		result = audit_comparator(tsk->fsgid, f->op, f->val);
+		result = audit_comparator(cred->fsgid, f->op, f->val);
 		break;
 	case AUDIT_PERS:
 		result = audit_comparator(tsk->personality, f->op, f->val);
@@ -615,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk,
615 break; 642 break;
616 } 643 }
617 644
618 if (!result) 645 if (!result) {
646 put_cred(cred);
619 return 0; 647 return 0;
648 }
620 } 649 }
621 if (rule->filterkey && ctx) 650 if (rule->filterkey && ctx)
622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -624,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
625 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
626 } 655 }
656 put_cred(cred);
627 return 1; 657 return 1;
628} 658}
629 659
@@ -1171,8 +1201,38 @@ static void audit_log_execve_info(struct audit_context *context,
1171 kfree(buf); 1201 kfree(buf);
1172} 1202}
1173 1203
1204static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1205{
1206 int i;
1207
1208 audit_log_format(ab, " %s=", prefix);
1209 CAP_FOR_EACH_U32(i) {
1210 audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1211 }
1212}
1213
1214static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1215{
1216 kernel_cap_t *perm = &name->fcap.permitted;
1217 kernel_cap_t *inh = &name->fcap.inheritable;
1218 int log = 0;
1219
1220 if (!cap_isclear(*perm)) {
1221 audit_log_cap(ab, "cap_fp", perm);
1222 log = 1;
1223 }
1224 if (!cap_isclear(*inh)) {
1225 audit_log_cap(ab, "cap_fi", inh);
1226 log = 1;
1227 }
1228
1229 if (log)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231}
1232
1174static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1175{ 1234{
1235 const struct cred *cred;
1176 int i, call_panic = 0; 1236 int i, call_panic = 0;
1177 struct audit_buffer *ab; 1237 struct audit_buffer *ab;
1178 struct audit_aux_data *aux; 1238 struct audit_aux_data *aux;
@@ -1182,14 +1242,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	context->pid = tsk->pid;
 	if (!context->ppid)
 		context->ppid = sys_getppid();
-	context->uid = tsk->uid;
-	context->gid = tsk->gid;
-	context->euid = tsk->euid;
-	context->suid = tsk->suid;
-	context->fsuid = tsk->fsuid;
-	context->egid = tsk->egid;
-	context->sgid = tsk->sgid;
-	context->fsgid = tsk->fsgid;
+	cred = current_cred();
+	context->uid = cred->uid;
+	context->gid = cred->gid;
+	context->euid = cred->euid;
+	context->suid = cred->suid;
+	context->fsuid = cred->fsuid;
+	context->egid = cred->egid;
+	context->sgid = cred->sgid;
+	context->fsgid = cred->fsgid;
 	context->personality = tsk->personality;
 
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1334,6 +1395,28 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1334 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); 1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1335 break; } 1396 break; }
1336 1397
1398 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver);
1401 audit_log_cap(ab, "fp", &axs->fcap.permitted);
1402 audit_log_cap(ab, "fi", &axs->fcap.inheritable);
1403 audit_log_format(ab, " fe=%d", axs->fcap.fE);
1404 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1405 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1406 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1407 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
1408 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; }
1411
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1337 } 1420 }
1338 audit_log_end(ab); 1421 audit_log_end(ab);
1339 } 1422 }
@@ -1421,6 +1504,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1421 } 1504 }
1422 } 1505 }
1423 1506
1507 audit_log_fcaps(ab, n);
1508
1424 audit_log_end(ab); 1509 audit_log_end(ab);
1425 } 1510 }
1426 1511
@@ -1802,8 +1887,36 @@ static int audit_inc_name_count(struct audit_context *context,
1802 return 0; 1887 return 0;
1803} 1888}
1804 1889
1890
1891static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1892{
1893 struct cpu_vfs_cap_data caps;
1894 int rc;
1895
1896 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1897 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1898 name->fcap.fE = 0;
1899 name->fcap_ver = 0;
1900
1901 if (!dentry)
1902 return 0;
1903
1904 rc = get_vfs_caps_from_disk(dentry, &caps);
1905 if (rc)
1906 return rc;
1907
1908 name->fcap.permitted = caps.permitted;
1909 name->fcap.inheritable = caps.inheritable;
1910 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1911 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
1912
1913 return 0;
1914}
1915
1916
1805/* Copy inode data into an audit_names. */ 1917/* Copy inode data into an audit_names. */
1806static void audit_copy_inode(struct audit_names *name, const struct inode *inode) 1918static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1919 const struct inode *inode)
1807{ 1920{
1808 name->ino = inode->i_ino; 1921 name->ino = inode->i_ino;
1809 name->dev = inode->i_sb->s_dev; 1922 name->dev = inode->i_sb->s_dev;
@@ -1812,6 +1925,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1812 name->gid = inode->i_gid; 1925 name->gid = inode->i_gid;
1813 name->rdev = inode->i_rdev; 1926 name->rdev = inode->i_rdev;
1814 security_inode_getsecid(inode, &name->osid); 1927 security_inode_getsecid(inode, &name->osid);
1928 audit_copy_fcaps(name, dentry);
1815} 1929}
1816 1930
1817/** 1931/**
@@ -1846,7 +1960,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1846 context->names[idx].name = NULL; 1960 context->names[idx].name = NULL;
1847 } 1961 }
1848 handle_path(dentry); 1962 handle_path(dentry);
1849 audit_copy_inode(&context->names[idx], inode); 1963 audit_copy_inode(&context->names[idx], dentry, inode);
1850} 1964}
1851 1965
1852/** 1966/**
@@ -1907,7 +2021,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
1907 if (!strcmp(dname, n->name) || 2021 if (!strcmp(dname, n->name) ||
1908 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2022 !audit_compare_dname_path(dname, n->name, &dirlen)) {
1909 if (inode) 2023 if (inode)
1910 audit_copy_inode(n, inode); 2024 audit_copy_inode(n, NULL, inode);
1911 else 2025 else
1912 n->ino = (unsigned long)-1; 2026 n->ino = (unsigned long)-1;
1913 found_child = n->name; 2027 found_child = n->name;
@@ -1921,7 +2035,7 @@ add_names:
1921 return; 2035 return;
1922 idx = context->name_count - 1; 2036 idx = context->name_count - 1;
1923 context->names[idx].name = NULL; 2037 context->names[idx].name = NULL;
1924 audit_copy_inode(&context->names[idx], parent); 2038 audit_copy_inode(&context->names[idx], NULL, parent);
1925 } 2039 }
1926 2040
1927 if (!found_child) { 2041 if (!found_child) {
@@ -1942,7 +2056,7 @@ add_names:
1942 } 2056 }
1943 2057
1944 if (inode) 2058 if (inode)
1945 audit_copy_inode(&context->names[idx], inode); 2059 audit_copy_inode(&context->names[idx], NULL, inode);
1946 else 2060 else
1947 context->names[idx].ino = (unsigned long)-1; 2061 context->names[idx].ino = (unsigned long)-1;
1948 } 2062 }
@@ -1996,7 +2110,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1996 audit_log_format(ab, "login pid=%d uid=%u " 2110 audit_log_format(ab, "login pid=%d uid=%u "
1997 "old auid=%u new auid=%u" 2111 "old auid=%u new auid=%u"
1998 " old ses=%u new ses=%u", 2112 " old ses=%u new ses=%u",
1999 task->pid, task->uid, 2113 task->pid, task_uid(task),
2000 task->loginuid, loginuid, 2114 task->loginuid, loginuid,
2001 task->sessionid, sessionid); 2115 task->sessionid, sessionid);
2002 audit_log_end(ab); 2116 audit_log_end(ab);
@@ -2379,7 +2493,7 @@ void __audit_ptrace(struct task_struct *t)
2379 2493
2380 context->target_pid = t->pid; 2494 context->target_pid = t->pid;
2381 context->target_auid = audit_get_loginuid(t); 2495 context->target_auid = audit_get_loginuid(t);
2382 context->target_uid = t->uid; 2496 context->target_uid = task_uid(t);
2383 context->target_sessionid = audit_get_sessionid(t); 2497 context->target_sessionid = audit_get_sessionid(t);
2384 security_task_getsecid(t, &context->target_sid); 2498 security_task_getsecid(t, &context->target_sid);
2385 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2499 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2398,6 +2512,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2398 struct audit_aux_data_pids *axp; 2512 struct audit_aux_data_pids *axp;
2399 struct task_struct *tsk = current; 2513 struct task_struct *tsk = current;
2400 struct audit_context *ctx = tsk->audit_context; 2514 struct audit_context *ctx = tsk->audit_context;
2515 uid_t uid = current_uid(), t_uid = task_uid(t);
2401 2516
2402 if (audit_pid && t->tgid == audit_pid) { 2517 if (audit_pid && t->tgid == audit_pid) {
2403 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2518 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2405,7 +2520,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2405 if (tsk->loginuid != -1) 2520 if (tsk->loginuid != -1)
2406 audit_sig_uid = tsk->loginuid; 2521 audit_sig_uid = tsk->loginuid;
2407 else 2522 else
2408 audit_sig_uid = tsk->uid; 2523 audit_sig_uid = uid;
2409 security_task_getsecid(tsk, &audit_sig_sid); 2524 security_task_getsecid(tsk, &audit_sig_sid);
2410 } 2525 }
2411 if (!audit_signals || audit_dummy_context()) 2526 if (!audit_signals || audit_dummy_context())
@@ -2417,7 +2532,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2417 if (!ctx->target_pid) { 2532 if (!ctx->target_pid) {
2418 ctx->target_pid = t->tgid; 2533 ctx->target_pid = t->tgid;
2419 ctx->target_auid = audit_get_loginuid(t); 2534 ctx->target_auid = audit_get_loginuid(t);
2420 ctx->target_uid = t->uid; 2535 ctx->target_uid = t_uid;
2421 ctx->target_sessionid = audit_get_sessionid(t); 2536 ctx->target_sessionid = audit_get_sessionid(t);
2422 security_task_getsecid(t, &ctx->target_sid); 2537 security_task_getsecid(t, &ctx->target_sid);
2423 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2538 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2438,7 +2553,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2438 2553
2439 axp->target_pid[axp->pid_count] = t->tgid; 2554 axp->target_pid[axp->pid_count] = t->tgid;
2440 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2555 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2441 axp->target_uid[axp->pid_count] = t->uid; 2556 axp->target_uid[axp->pid_count] = t_uid;
2442 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2557 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2443 security_task_getsecid(t, &axp->target_sid[axp->pid_count]); 2558 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2444 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2559 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2448,6 +2563,86 @@ int __audit_signal_info(int sig, struct task_struct *t)
2448} 2563}
2449 2564
2450/** 2565/**
2566 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
2567 * @bprm: pointer to the bprm being processed
2568 * @new: the proposed new credentials
2569 * @old: the old credentials
2570 *
2571 * Simply check if the proc already has the caps given by the file and if not
2572 * store the priv escalation info for later auditing at the end of the syscall
2573 *
2574 * -Eric
2575 */
2576int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2577 const struct cred *new, const struct cred *old)
2578{
2579 struct audit_aux_data_bprm_fcaps *ax;
2580 struct audit_context *context = current->audit_context;
2581 struct cpu_vfs_cap_data vcaps;
2582 struct dentry *dentry;
2583
2584 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2585 if (!ax)
2586 return -ENOMEM;
2587
2588 ax->d.type = AUDIT_BPRM_FCAPS;
2589 ax->d.next = context->aux;
2590 context->aux = (void *)ax;
2591
2592 dentry = dget(bprm->file->f_dentry);
2593 get_vfs_caps_from_disk(dentry, &vcaps);
2594 dput(dentry);
2595
2596 ax->fcap.permitted = vcaps.permitted;
2597 ax->fcap.inheritable = vcaps.inheritable;
2598 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2599 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2600
2601 ax->old_pcap.permitted = old->cap_permitted;
2602 ax->old_pcap.inheritable = old->cap_inheritable;
2603 ax->old_pcap.effective = old->cap_effective;
2604
2605 ax->new_pcap.permitted = new->cap_permitted;
2606 ax->new_pcap.inheritable = new->cap_inheritable;
2607 ax->new_pcap.effective = new->cap_effective;
2608 return 0;
2609}
2610
2611/**
2612 * __audit_log_capset - store information about the arguments to the capset syscall
2613 * @pid: target pid of the capset call
2614 * @new: the new credentials
2615 * @old: the old (current) credentials
2616 *
2617 * Record the aguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable
2619 */
2620int __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old)
2622{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context;
2625
2626 if (likely(!audit_enabled || !context || context->dummy))
2627 return 0;
2628
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643}
2644
2645/**
2451 * audit_core_dumps - record information about processes that end abnormally 2646 * audit_core_dumps - record information about processes that end abnormally
2452 * @signr: signal value 2647 * @signr: signal value
2453 * 2648 *
@@ -2458,7 +2653,8 @@ void audit_core_dumps(long signr)
2458{ 2653{
2459 struct audit_buffer *ab; 2654 struct audit_buffer *ab;
2460 u32 sid; 2655 u32 sid;
2461 uid_t auid = audit_get_loginuid(current); 2656 uid_t auid = audit_get_loginuid(current), uid;
2657 gid_t gid;
2462 unsigned int sessionid = audit_get_sessionid(current); 2658 unsigned int sessionid = audit_get_sessionid(current);
2463 2659
2464 if (!audit_enabled) 2660 if (!audit_enabled)
@@ -2468,8 +2664,9 @@ void audit_core_dumps(long signr)
2468 return; 2664 return;
2469 2665
2470 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2667 current_uid_gid(&uid, &gid);
2471 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2668 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2472 auid, current->uid, current->gid, sessionid); 2669 auid, uid, gid, sessionid);
2473 security_task_getsecid(current, &sid); 2670 security_task_getsecid(current, &sid);
2474 if (sid) { 2671 if (sid) {
2475 char *ctx = NULL; 2672 char *ctx = NULL;
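
Throughout the auditsc.c changes, per-task uid/gid fields are read through credential structures: get_task_cred() is used where a reference must outlive the lookup (audit_filter_rules() pairs it with put_cred() on every exit path), and current_cred()/task_uid() where a snapshot of the current values is enough. A minimal sketch of the get/put pattern, not part of the patch (task_uid_matches() is a made-up example):

    #include <linux/sched.h>
    #include <linux/cred.h>

    /* Sketch: compare another task's real uid against a value.
     * get_task_cred() takes a reference on tsk's objective creds, so
     * the pointer stays valid even if tsk changes creds or exits;
     * it must always be balanced by put_cred(). */
    static int task_uid_matches(struct task_struct *tsk, uid_t uid)
    {
            const struct cred *cred = get_task_cred(tsk);
            int match = (cred->uid == uid);

            put_cred(cred);
            return match;
    }
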
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d8..36b4b4daebec 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
  */
 
+#include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -14,12 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-
-/*
- * This lock protects task->cap_* for all tasks including current.
- * Locking rule: acquire this prior to tasklist_lock.
- */
-static DEFINE_SPINLOCK(task_capability_lock);
+#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -33,6 +29,17 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+int file_caps_enabled = 1;
+
+static int __init file_caps_disable(char *str)
+{
+	file_caps_enabled = 0;
+	return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+#endif
+
 /*
  * More recent versions of libcap are available from:
  *
@@ -115,167 +122,12 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 122 return 0;
116} 123}
117 124
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/*
121 * Without filesystem capability support, we nominally support one process
122 * setting the capabilities of another
123 */
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
125 kernel_cap_t *pIp, kernel_cap_t *pPp)
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
272/* 125/*
273 * If we have configured with filesystem capability support, then the 126 * The only thing that can change the capabilities of the current
274 * only thing that can change the capabilities of the current process 127 * process is the current process. As such, we can't be in this code
275 * is the current process. As such, we can't be in this code at the 128 * at the same time as we are in the process of setting capabilities
276 * same time as we are in the process of setting capabilities in this 129 * in this process. The net result is that we can limit our use of
277 * process. The net result is that we can limit our use of locks to 130 * locks to when we are reading the caps of another process.
278 * when we are reading the caps of another process.
279 */ 131 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, 132static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp) 133 kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -285,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
285 if (pid && (pid != task_pid_vnr(current))) { 137 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target; 138 struct task_struct *target;
287 139
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock); 140 read_lock(&tasklist_lock);
290 141
291 target = find_task_by_vpid(pid); 142 target = find_task_by_vpid(pid);
@@ -295,50 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
295 ret = security_capget(target, pEp, pIp, pPp); 146 ret = security_capget(target, pEp, pIp, pPp);
296 147
297 read_unlock(&tasklist_lock); 148 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else 149 } else
300 ret = security_capget(current, pEp, pIp, pPp); 150 ret = security_capget(current, pEp, pIp, pPp);
301 151
302 return ret; 152 return ret;
303} 153}
304 154
305/*
306 * With filesystem capability support configured, the kernel does not
307 * permit the changing of capabilities in one process by another
308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
320
321/*
322 * Atomically modify the effective capabilities returning the original
323 * value. No permission check is performed here - it is assumed that the
324 * caller is permitted to set the desired effective capabilities.
325 */
326kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
327{
328 kernel_cap_t pE_old;
329
330 spin_lock(&task_capability_lock);
331
332 pE_old = current->cap_effective;
333 current->cap_effective = pE_new;
334
335 spin_unlock(&task_capability_lock);
336
337 return pE_old;
338}
339
340EXPORT_SYMBOL(cap_set_effective);
341
342/** 155/**
343 * sys_capget - get the capabilities of a given process. 156 * sys_capget - get the capabilities of a given process.
344 * @header: pointer to struct that contains capability version and 157 * @header: pointer to struct that contains capability version and
@@ -366,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
366 return -EINVAL; 179 return -EINVAL;
367 180
368 ret = cap_get_target_pid(pid, &pE, &pI, &pP); 181 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
369
370 if (!ret) { 182 if (!ret) {
371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 183 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
372 unsigned i; 184 unsigned i;
@@ -412,16 +224,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
412 * @data: pointer to struct that contains the effective, permitted, 224 * @data: pointer to struct that contains the effective, permitted,
413 * and inheritable capabilities 225 * and inheritable capabilities
414 * 226 *
415 * Set capabilities for a given process, all processes, or all 227 * Set capabilities for the current process only. The ability to any other
416 * processes in a given process group. 228 * process(es) has been deprecated and removed.
417 * 229 *
418 * The restrictions on setting capabilities are specified as: 230 * The restrictions on setting capabilities are specified as:
419 * 231 *
420 * [pid is for the 'target' task. 'current' is the calling task.] 232 * I: any raised capabilities must be a subset of the old permitted
421 * 233 * P: any raised capabilities must be a subset of the old permitted
422 * I: any raised capabilities must be a subset of the (old current) permitted 234 * E: must be set to a subset of new permitted
423 * P: any raised capabilities must be a subset of the (old current) permitted
424 * E: must be set to a subset of (new target) permitted
425 * 235 *
426 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
427 */ 237 */
@@ -430,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
+	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -440,10 +251,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
-	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct))) {
+	/* may only affect current now */
+	if (pid != 0 && pid != task_pid_vnr(current))
+		return -EPERM;
+
+	if (copy_from_user(&kdata, data,
+			   tocopy * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
-	}
 
 	for (i = 0; i < tocopy; i++) {
 		effective.cap[i] = kdata[i].effective;
@@ -457,32 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
-	if (pid && (pid != task_pid_vnr(current)))
-		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
-						&permitted);
-	else {
-		/*
-		 * This lock is required even when filesystem
-		 * capability support is configured - it protects the
-		 * sys_capget() call from returning incorrect data in
-		 * the case that the targeted process is not the
-		 * current one.
-		 */
-		spin_lock(&task_capability_lock);
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-		ret = security_capset_check(current, &effective, &inheritable,
-					    &permitted);
-		/*
-		 * Having verified that the proposed changes are
-		 * legal, we now put them into effect.
-		 */
-		if (!ret)
-			security_capset_set(current, &effective, &inheritable,
-					    &permitted);
-		spin_unlock(&task_capability_lock);
-	}
+	ret = security_capset(new, current_cred(),
+			      &effective, &inheritable, &permitted);
+	if (ret < 0)
+		goto error;
+
+	ret = audit_log_capset(pid, new, current_cred());
+	if (ret < 0)
+		return ret;
 
+	return commit_creds(new);
 
+error:
+	abort_creds(new);
 	return ret;
 }
 
@@ -498,6 +303,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
  */
 int capable(int cap)
 {
+	if (unlikely(!cap_valid(cap))) {
+		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+		BUG();
+	}
+
 	if (has_capability(current, cap)) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
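
sys_capset() now follows the prepare/commit model of the credentials API: build a candidate cred off to the side, let security_capset() vet it, then either install it atomically or discard it. A minimal sketch of that pattern, not part of the patch (drop_all_caps() is a made-up example):

    #include <linux/cred.h>
    #include <linux/capability.h>

    /* Sketch: modify the current task's capabilities via a new cred. */
    static int drop_all_caps(void)
    {
            struct cred *new = prepare_creds();     /* copy of current creds */

            if (!new)
                    return -ENOMEM;

            cap_clear(new->cap_effective);
            cap_clear(new->cap_permitted);
            cap_clear(new->cap_inheritable);

            return commit_creds(new);       /* installs and consumes 'new' */
    }

On an error path taken before the commit, abort_creds(new) must be called instead, which is exactly what the error label in the sys_capset() hunk above does.
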
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8185a0f09594..48348dde6d81 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = current->fsuid;
-		inode->i_gid = current->fsgid;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = current_fsgid();
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
@@ -1024,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	if (ret == -EBUSY) {
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
-		goto drop_new_super;
+		goto free_cg_links;
 	}
 
 	/* EBUSY should be the only error here */
@@ -1073,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
 	return simple_set_mnt(mnt, sb);
 
+ free_cg_links:
+	free_cg_links(&tmp_cg_links);
  drop_new_super:
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	free_cg_links(&tmp_cg_links);
 	return ret;
 }
 
@@ -1279,6 +1280,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
 	struct task_struct *tsk;
+	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
 	if (pid) {
@@ -1288,14 +1290,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 			rcu_read_unlock();
 			return -ESRCH;
 		}
-		get_task_struct(tsk);
-		rcu_read_unlock();
 
-		if ((current->euid) && (current->euid != tsk->uid)
-		    && (current->euid != tsk->suid)) {
-			put_task_struct(tsk);
+		tcred = __task_cred(tsk);
+		if (cred->euid &&
+		    cred->euid != tcred->uid &&
+		    cred->euid != tcred->suid) {
+			rcu_read_unlock();
 			return -EACCES;
 		}
+		get_task_struct(tsk);
+		rcu_read_unlock();
 	} else {
 		tsk = current;
 		get_task_struct(tsk);
@@ -2934,9 +2938,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  again:
 	root = subsys->root;
 	if (root == &rootnode) {
-		printk(KERN_INFO
-		       "Not cloning cgroup for unused subsystem %s\n",
-		       subsys->name);
 		mutex_unlock(&cgroup_mutex);
 		return 0;
 	}
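
attach_task_by_pid() now checks the target's credentials through __task_cred() while still inside the RCU read-side section, instead of dereferencing task_struct fields after the lookup. A minimal sketch of that check, not part of the patch (may_attach() is a made-up name):

    #include <linux/sched.h>
    #include <linux/rcupdate.h>
    #include <linux/cred.h>

    /* Sketch: is the caller allowed to act on tsk?  __task_cred() is
     * only valid under rcu_read_lock() (or with the task locked). */
    static int may_attach(struct task_struct *tsk)
    {
            const struct cred *cred = current_cred(), *tcred;
            int ok;

            rcu_read_lock();
            tcred = __task_cred(tsk);
            ok = !cred->euid || cred->euid == tcred->uid ||
                 cred->euid == tcred->suid;
            rcu_read_unlock();

            return ok;
    }
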
diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d9..d52e2ec1deb5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -454,16 +454,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
 }
 
 static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
-				     unsigned len, cpumask_t *new_mask)
+				     unsigned len, struct cpumask *new_mask)
 {
 	unsigned long *k;
 
-	if (len < sizeof(cpumask_t))
-		memset(new_mask, 0, sizeof(cpumask_t));
-	else if (len > sizeof(cpumask_t))
-		len = sizeof(cpumask_t);
+	if (len < cpumask_size())
+		memset(new_mask, 0, cpumask_size());
+	else if (len > cpumask_size())
+		len = cpumask_size();
 
-	k = cpus_addr(*new_mask);
+	k = cpumask_bits(new_mask);
 	return compat_get_bitmap(k, user_mask_ptr, len * 8);
 }
 
@@ -471,40 +471,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
 					     unsigned int len,
 					     compat_ulong_t __user *user_mask_ptr)
 {
-	cpumask_t new_mask;
+	cpumask_var_t new_mask;
 	int retval;
 
-	retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
 	if (retval)
-		return retval;
+		goto out;
 
-	return sched_setaffinity(pid, &new_mask);
+	retval = sched_setaffinity(pid, new_mask);
+out:
+	free_cpumask_var(new_mask);
+	return retval;
 }
 
 asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 					     compat_ulong_t __user *user_mask_ptr)
 {
 	int ret;
-	cpumask_t mask;
+	cpumask_var_t mask;
 	unsigned long *k;
-	unsigned int min_length = sizeof(cpumask_t);
+	unsigned int min_length = cpumask_size();
 
-	if (NR_CPUS <= BITS_PER_COMPAT_LONG)
+	if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
 		min_length = sizeof(compat_ulong_t);
 
 	if (len < min_length)
 		return -EINVAL;
 
-	ret = sched_getaffinity(pid, &mask);
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
 	if (ret < 0)
-		return ret;
+		goto out;
 
-	k = cpus_addr(mask);
+	k = cpumask_bits(mask);
 	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
-	if (ret)
-		return ret;
+	if (ret == 0)
+		ret = min_length;
 
-	return min_length;
+out:
+	free_cpumask_var(mask);
+	return ret;
 }
 
 int get_compat_itimerspec(struct itimerspec *dst,
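
Both compat affinity syscalls move from an on-stack cpumask_t to cpumask_var_t, which must be allocated and freed explicitly: with CONFIG_CPUMASK_OFFSTACK the mask comes from the heap (sized for nr_cpu_ids), otherwise it is still a plain stack mask and the calls are cheap. A minimal sketch of the pattern, not part of the patch (count_online_cpus() is a made-up example):

    #include <linux/cpumask.h>
    #include <linux/slab.h>

    /* Sketch: count online CPUs using a temporary cpumask_var_t. */
    static int count_online_cpus(void)
    {
            cpumask_var_t mask;
            int n;

            if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                    return -ENOMEM;

            cpumask_copy(mask, cpu_online_mask);
            n = cpumask_weight(mask);

            free_cpumask_var(mask);
            return n;
    }
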
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bae131a1211b..47fff3b63cbf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,30 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
19 * Represents all cpu's present in the system
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27/*
28 * Represents all cpu's that are currently online.
29 */
30cpumask_t cpu_online_map __read_mostly;
31EXPORT_SYMBOL(cpu_online_map);
32
33#ifdef CONFIG_INIT_ALL_POSSIBLE
34cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
35#else
36cpumask_t cpu_possible_map __read_mostly;
37#endif
38EXPORT_SYMBOL(cpu_possible_map);
39
40#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
41/* Serializes the updates to cpu_online_map, cpu_present_map */ 19/* Serializes the updates to cpu_online_mask, cpu_present_mask */
42static DEFINE_MUTEX(cpu_add_remove_lock); 20static DEFINE_MUTEX(cpu_add_remove_lock);
43 21
44static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void)
65 cpu_hotplug.refcount = 0; 43 cpu_hotplug.refcount = 0;
66} 44}
67 45
68cpumask_t cpu_active_map;
69
70#ifdef CONFIG_HOTPLUG_CPU 46#ifdef CONFIG_HOTPLUG_CPU
71 47
72void get_online_cpus(void) 48void get_online_cpus(void)
@@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
97 73
98/* 74/*
99 * The following two API's must be used when attempting 75 * The following two API's must be used when attempting
100 * to serialize the updates to cpu_online_map, cpu_present_map. 76 * to serialize the updates to cpu_online_mask, cpu_present_mask.
101 */ 77 */
102void cpu_maps_update_begin(void) 78void cpu_maps_update_begin(void)
103{ 79{
@@ -218,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
218static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 194static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
219{ 195{
220 int err, nr_calls = 0; 196 int err, nr_calls = 0;
221 cpumask_t old_allowed, tmp; 197 cpumask_var_t old_allowed;
222 void *hcpu = (void *)(long)cpu; 198 void *hcpu = (void *)(long)cpu;
223 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 199 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
224 struct take_cpu_down_param tcd_param = { 200 struct take_cpu_down_param tcd_param = {
@@ -232,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
232 if (!cpu_online(cpu)) 208 if (!cpu_online(cpu))
233 return -EINVAL; 209 return -EINVAL;
234 210
211 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
212 return -ENOMEM;
213
235 cpu_hotplug_begin(); 214 cpu_hotplug_begin();
236 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
237 hcpu, -1, &nr_calls); 216 hcpu, -1, &nr_calls);
@@ -246,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
-	old_allowed = current->cpus_allowed;
-	cpus_setall(tmp);
-	cpu_clear(cpu, tmp);
-	set_cpus_allowed_ptr(current, &tmp);
-	tmp = cpumask_of_cpu(cpu);
+	cpumask_copy(old_allowed, &current->cpus_allowed);
+	set_cpus_allowed_ptr(current,
+			     cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
 
-	err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
+	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone. Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -278,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
278 check_for_tasks(cpu); 255 check_for_tasks(cpu);
279 256
280out_allowed: 257out_allowed:
281 set_cpus_allowed_ptr(current, &old_allowed); 258 set_cpus_allowed_ptr(current, old_allowed);
282out_release: 259out_release:
283 cpu_hotplug_done(); 260 cpu_hotplug_done();
284 if (!err) { 261 if (!err) {
@@ -286,6 +263,7 @@ out_release:
286 hcpu) == NOTIFY_BAD) 263 hcpu) == NOTIFY_BAD)
287 BUG(); 264 BUG();
288 } 265 }
266 free_cpumask_var(old_allowed);
289 return err; 267 return err;
290} 268}
291 269
@@ -304,7 +282,7 @@ int __ref cpu_down(unsigned int cpu)
304 282
305 /* 283 /*
306 * Make sure the all cpus did the reschedule and are not 284 * Make sure the all cpus did the reschedule and are not
307 * using stale version of the cpu_active_map. 285 * using stale version of the cpu_active_mask.
308 * This is not strictly necessary becuase stop_machine() 286 * This is not strictly necessary becuase stop_machine()
309 * that we run down the line already provides the required 287 * that we run down the line already provides the required
310 * synchronization. But it's really a side effect and we do not 288 * synchronization. But it's really a side effect and we do not
@@ -368,7 +346,7 @@ out_notify:
368int __cpuinit cpu_up(unsigned int cpu) 346int __cpuinit cpu_up(unsigned int cpu)
369{ 347{
370 int err = 0; 348 int err = 0;
371 if (!cpu_isset(cpu, cpu_possible_map)) { 349 if (!cpu_possible(cpu)) {
372 printk(KERN_ERR "can't online cpu %d because it is not " 350 printk(KERN_ERR "can't online cpu %d because it is not "
373 "configured as may-hotadd at boot time\n", cpu); 351 "configured as may-hotadd at boot time\n", cpu);
374#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 352#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -393,25 +371,25 @@ out:
393} 371}
394 372
395#ifdef CONFIG_PM_SLEEP_SMP 373#ifdef CONFIG_PM_SLEEP_SMP
396static cpumask_t frozen_cpus; 374static cpumask_var_t frozen_cpus;
397 375
398int disable_nonboot_cpus(void) 376int disable_nonboot_cpus(void)
399{ 377{
400 int cpu, first_cpu, error = 0; 378 int cpu, first_cpu, error = 0;
401 379
402 cpu_maps_update_begin(); 380 cpu_maps_update_begin();
403 first_cpu = first_cpu(cpu_online_map); 381 first_cpu = cpumask_first(cpu_online_mask);
404 /* We take down all of the non-boot CPUs in one shot to avoid races 382 /* We take down all of the non-boot CPUs in one shot to avoid races
405 * with the userspace trying to use the CPU hotplug at the same time 383 * with the userspace trying to use the CPU hotplug at the same time
406 */ 384 */
407 cpus_clear(frozen_cpus); 385 cpumask_clear(frozen_cpus);
408 printk("Disabling non-boot CPUs ...\n"); 386 printk("Disabling non-boot CPUs ...\n");
409 for_each_online_cpu(cpu) { 387 for_each_online_cpu(cpu) {
410 if (cpu == first_cpu) 388 if (cpu == first_cpu)
411 continue; 389 continue;
412 error = _cpu_down(cpu, 1); 390 error = _cpu_down(cpu, 1);
413 if (!error) { 391 if (!error) {
414 cpu_set(cpu, frozen_cpus); 392 cpumask_set_cpu(cpu, frozen_cpus);
415 printk("CPU%d is down\n", cpu); 393 printk("CPU%d is down\n", cpu);
416 } else { 394 } else {
417 printk(KERN_ERR "Error taking CPU%d down: %d\n", 395 printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -437,11 +415,11 @@ void __ref enable_nonboot_cpus(void)
437 /* Allow everyone to use the CPU hotplug again */ 415 /* Allow everyone to use the CPU hotplug again */
438 cpu_maps_update_begin(); 416 cpu_maps_update_begin();
439 cpu_hotplug_disabled = 0; 417 cpu_hotplug_disabled = 0;
440 if (cpus_empty(frozen_cpus)) 418 if (cpumask_empty(frozen_cpus))
441 goto out; 419 goto out;
442 420
443 printk("Enabling non-boot CPUs ...\n"); 421 printk("Enabling non-boot CPUs ...\n");
444 for_each_cpu_mask_nr(cpu, frozen_cpus) { 422 for_each_cpu(cpu, frozen_cpus) {
445 error = _cpu_up(cpu, 1); 423 error = _cpu_up(cpu, 1);
446 if (!error) { 424 if (!error) {
447 printk("CPU%d is up\n", cpu); 425 printk("CPU%d is up\n", cpu);
@@ -449,10 +427,18 @@ void __ref enable_nonboot_cpus(void)
449 } 427 }
450 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 428 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
451 } 429 }
452 cpus_clear(frozen_cpus); 430 cpumask_clear(frozen_cpus);
453out: 431out:
454 cpu_maps_update_done(); 432 cpu_maps_update_done();
455} 433}
434
435static int alloc_frozen_cpus(void)
436{
437 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
438 return -ENOMEM;
439 return 0;
440}
441core_initcall(alloc_frozen_cpus);
456#endif /* CONFIG_PM_SLEEP_SMP */ 442#endif /* CONFIG_PM_SLEEP_SMP */
457 443
458/** 444/**
@@ -468,7 +454,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
468 unsigned long val = CPU_STARTING; 454 unsigned long val = CPU_STARTING;
469 455
470#ifdef CONFIG_PM_SLEEP_SMP 456#ifdef CONFIG_PM_SLEEP_SMP
471 if (cpu_isset(cpu, frozen_cpus)) 457 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
472 val = CPU_STARTING_FROZEN; 458 val = CPU_STARTING_FROZEN;
473#endif /* CONFIG_PM_SLEEP_SMP */ 459#endif /* CONFIG_PM_SLEEP_SMP */
474 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 460 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -480,7 +466,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
480 * cpu_bit_bitmap[] is a special, "compressed" data structure that 466 * cpu_bit_bitmap[] is a special, "compressed" data structure that
481 * represents all NR_CPUS bits binary values of 1<<nr. 467 * represents all NR_CPUS bits binary values of 1<<nr.
482 * 468 *
483 * It is used by cpumask_of_cpu() to get a constant address to a CPU 469 * It is used by cpumask_of() to get a constant address to a CPU
484 * mask value that has a single bit set only. 470 * mask value that has a single bit set only.
485 */ 471 */
486 472
@@ -503,3 +489,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
503 489
504const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; 490const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
505EXPORT_SYMBOL(cpu_all_bits); 491EXPORT_SYMBOL(cpu_all_bits);
492
493#ifdef CONFIG_INIT_ALL_POSSIBLE
494static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
495 = CPU_BITS_ALL;
496#else
497static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
498#endif
499const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
500EXPORT_SYMBOL(cpu_possible_mask);
501
502static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
503const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
504EXPORT_SYMBOL(cpu_online_mask);
505
506static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
507const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
508EXPORT_SYMBOL(cpu_present_mask);
509
510static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
511const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
512EXPORT_SYMBOL(cpu_active_mask);
513
514void set_cpu_possible(unsigned int cpu, bool possible)
515{
516 if (possible)
517 cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
518 else
519 cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
520}
521
522void set_cpu_present(unsigned int cpu, bool present)
523{
524 if (present)
525 cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
526 else
527 cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
528}
529
530void set_cpu_online(unsigned int cpu, bool online)
531{
532 if (online)
533 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
534 else
535 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
536}
537
538void set_cpu_active(unsigned int cpu, bool active)
539{
540 if (active)
541 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
542 else
543 cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
544}
545
546void init_cpu_present(const struct cpumask *src)
547{
548 cpumask_copy(to_cpumask(cpu_present_bits), src);
549}
550
551void init_cpu_possible(const struct cpumask *src)
552{
553 cpumask_copy(to_cpumask(cpu_possible_bits), src);
554}
555
556void init_cpu_online(const struct cpumask *src)
557{
558 cpumask_copy(to_cpumask(cpu_online_bits), src);
559}
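The cpu.c changes above convert frozen_cpus to a dynamically allocated cpumask_var_t and hide the possible/present/online/active maps behind accessor functions. As a rough illustration of how callers are expected to use that API (the helpers example_mark_cpus_present() and example_snapshot_online() are hypothetical; the cpumask calls are the ones introduced above):

	#include <linux/cpumask.h>
	#include <linux/slab.h>

	/* Hypothetical helper: mark a range of CPUs present via the new
	 * accessor instead of poking the bitmap directly. */
	static void example_mark_cpus_present(unsigned int first, unsigned int nr)
	{
		unsigned int cpu;

		for (cpu = first; cpu < first + nr; cpu++)
			set_cpu_present(cpu, true);
	}

	/* Hypothetical helper: take a private snapshot of the online map,
	 * following the same allocate-then-use pattern as frozen_cpus
	 * (the caller would free_cpumask_var() it later). */
	static int example_snapshot_online(cpumask_var_t *snap)
	{
		if (!alloc_cpumask_var(snap, GFP_KERNEL))
			return -ENOMEM;
		cpumask_copy(*snap, cpu_online_mask);
		return 0;
	}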
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 000000000000..2dc4fc2d0bf1
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 000000000000..ff7bc071991c
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,588 @@
1/* Task credentials management - see Documentation/credentials.txt
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11#include <linux/module.h>
12#include <linux/cred.h>
13#include <linux/sched.h>
14#include <linux/key.h>
15#include <linux/keyctl.h>
16#include <linux/init_task.h>
17#include <linux/security.h>
18#include <linux/cn_proc.h>
19#include "cred-internals.h"
20
21static struct kmem_cache *cred_jar;
22
23/*
24 * The common credentials for the initial task's thread group
25 */
26#ifdef CONFIG_KEYS
27static struct thread_group_cred init_tgcred = {
28 .usage = ATOMIC_INIT(2),
29 .tgid = 0,
30 .lock = SPIN_LOCK_UNLOCKED,
31};
32#endif
33
34/*
35 * The initial credentials for the initial task
36 */
37struct cred init_cred = {
38 .usage = ATOMIC_INIT(4),
39 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET,
42 .cap_effective = CAP_INIT_EFF_SET,
43 .cap_bset = CAP_INIT_BSET,
44 .user = INIT_USER,
45 .group_info = &init_groups,
46#ifdef CONFIG_KEYS
47 .tgcred = &init_tgcred,
48#endif
49};
50
51/*
52 * Dispose of the shared task group credentials
53 */
54#ifdef CONFIG_KEYS
55static void release_tgcred_rcu(struct rcu_head *rcu)
56{
57 struct thread_group_cred *tgcred =
58 container_of(rcu, struct thread_group_cred, rcu);
59
60 BUG_ON(atomic_read(&tgcred->usage) != 0);
61
62 key_put(tgcred->session_keyring);
63 key_put(tgcred->process_keyring);
64 kfree(tgcred);
65}
66#endif
67
68/*
69 * Release a set of thread group credentials.
70 */
71static void release_tgcred(struct cred *cred)
72{
73#ifdef CONFIG_KEYS
74 struct thread_group_cred *tgcred = cred->tgcred;
75
76 if (atomic_dec_and_test(&tgcred->usage))
77 call_rcu(&tgcred->rcu, release_tgcred_rcu);
78#endif
79}
80
81/*
82 * The RCU callback to actually dispose of a set of credentials
83 */
84static void put_cred_rcu(struct rcu_head *rcu)
85{
86 struct cred *cred = container_of(rcu, struct cred, rcu);
87
88 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage));
91
92 security_cred_free(cred);
93 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth);
95 release_tgcred(cred);
96 put_group_info(cred->group_info);
97 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred);
99}
100
101/**
102 * __put_cred - Destroy a set of credentials
103 * @cred: The record to release
104 *
105 * Destroy a set of credentials on which no references remain.
106 */
107void __put_cred(struct cred *cred)
108{
109 BUG_ON(atomic_read(&cred->usage) != 0);
110
111 call_rcu(&cred->rcu, put_cred_rcu);
112}
113EXPORT_SYMBOL(__put_cred);
114
115/**
116 * prepare_creds - Prepare a new set of credentials for modification
117 *
118 * Prepare a new set of task credentials for modification. A task's creds
119 * shouldn't generally be modified directly; instead, this function is used to
120 * prepare a new copy, which the caller modifies and then commits by
121 * calling commit_creds().
122 *
123 * Preparation involves making a copy of the objective creds for modification.
124 *
125 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
126 *
127 * Call commit_creds() or abort_creds() to clean up.
128 */
129struct cred *prepare_creds(void)
130{
131 struct task_struct *task = current;
132 const struct cred *old;
133 struct cred *new;
134
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1);
136
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new)
139 return NULL;
140
141 old = task->cred;
142 memcpy(new, old, sizeof(struct cred));
143
144 atomic_set(&new->usage, 1);
145 get_group_info(new->group_info);
146 get_uid(new->user);
147
148#ifdef CONFIG_KEYS
149 key_get(new->thread_keyring);
150 key_get(new->request_key_auth);
151 atomic_inc(&new->tgcred->usage);
152#endif
153
154#ifdef CONFIG_SECURITY
155 new->security = NULL;
156#endif
157
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error;
160 return new;
161
162error:
163 abort_creds(new);
164 return NULL;
165}
166EXPORT_SYMBOL(prepare_creds);
167
168/*
169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex
171 */
172struct cred *prepare_exec_creds(void)
173{
174 struct thread_group_cred *tgcred = NULL;
175 struct cred *new;
176
177#ifdef CONFIG_KEYS
178 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
179 if (!tgcred)
180 return NULL;
181#endif
182
183 new = prepare_creds();
184 if (!new) {
185 kfree(tgcred);
186 return new;
187 }
188
189#ifdef CONFIG_KEYS
190 /* newly exec'd tasks don't get a thread keyring */
191 key_put(new->thread_keyring);
192 new->thread_keyring = NULL;
193
194 /* create a new set of per-thread-group creds for all the threads in
195 * this group to share */
196 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
197
198 atomic_set(&tgcred->usage, 1);
199 spin_lock_init(&tgcred->lock);
200
201 /* inherit the session keyring; new process keyring */
202 key_get(tgcred->session_keyring);
203 tgcred->process_keyring = NULL;
204
205 release_tgcred(new);
206 new->tgcred = tgcred;
207#endif
208
209 return new;
210}
211
212/*
213 * prepare new credentials for the usermode helper dispatcher
214 */
215struct cred *prepare_usermodehelper_creds(void)
216{
217#ifdef CONFIG_KEYS
218 struct thread_group_cred *tgcred = NULL;
219#endif
220 struct cred *new;
221
222#ifdef CONFIG_KEYS
223 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
224 if (!tgcred)
225 return NULL;
226#endif
227
228 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
229 if (!new)
230 return NULL;
231
232 memcpy(new, &init_cred, sizeof(struct cred));
233
234 atomic_set(&new->usage, 1);
235 get_group_info(new->group_info);
236 get_uid(new->user);
237
238#ifdef CONFIG_KEYS
239 new->thread_keyring = NULL;
240 new->request_key_auth = NULL;
241 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
242
243 atomic_set(&tgcred->usage, 1);
244 spin_lock_init(&tgcred->lock);
245 new->tgcred = tgcred;
246#endif
247
248#ifdef CONFIG_SECURITY
249 new->security = NULL;
250#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error;
253
254 BUG_ON(atomic_read(&new->usage) != 1);
255 return new;
256
257error:
258 put_cred(new);
259 return NULL;
260}
261
262/*
263 * Copy credentials for the new process created by fork()
264 *
265 * We share if we can, but under some circumstances we have to generate a new
266 * set.
267 *
268 * The new process gets the current process's subjective credentials as its
269 * objective and subjective credentials
270 */
271int copy_creds(struct task_struct *p, unsigned long clone_flags)
272{
273#ifdef CONFIG_KEYS
274 struct thread_group_cred *tgcred;
275#endif
276 struct cred *new;
277 int ret;
278
279 mutex_init(&p->cred_exec_mutex);
280
281 if (
282#ifdef CONFIG_KEYS
283 !p->cred->thread_keyring &&
284#endif
285 clone_flags & CLONE_THREAD
286 ) {
287 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred);
289 atomic_inc(&p->cred->user->processes);
290 return 0;
291 }
292
293 new = prepare_creds();
294 if (!new)
295 return -ENOMEM;
296
297 if (clone_flags & CLONE_NEWUSER) {
298 ret = create_user_ns(new);
299 if (ret < 0)
300 goto error_put;
301 }
302
303#ifdef CONFIG_KEYS
304 /* new threads get their own thread keyrings if their parent already
305 * had one */
306 if (new->thread_keyring) {
307 key_put(new->thread_keyring);
308 new->thread_keyring = NULL;
309 if (clone_flags & CLONE_THREAD)
310 install_thread_keyring_to_cred(new);
311 }
312
313 /* we share the process and session keyrings between all the threads in
314 * a process - this is slightly icky as we violate COW credentials a
315 * bit */
316 if (!(clone_flags & CLONE_THREAD)) {
317 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
318 if (!tgcred) {
319 ret = -ENOMEM;
320 goto error_put;
321 }
322 atomic_set(&tgcred->usage, 1);
323 spin_lock_init(&tgcred->lock);
324 tgcred->process_keyring = NULL;
325 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
326
327 release_tgcred(new);
328 new->tgcred = tgcred;
329 }
330#endif
331
332 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new);
334 return 0;
335
336error_put:
337 put_cred(new);
338 return ret;
339}
340
341/**
342 * commit_creds - Install new credentials upon the current task
343 * @new: The credentials to be assigned
344 *
345 * Install a new set of credentials to the current task, using RCU to replace
346 * the old set. Both the objective and the subjective credentials pointers are
347 * updated. This function may not be called if the subjective credentials are
348 * in an overridden state.
349 *
350 * This function eats the caller's reference to the new credentials.
351 *
352 * Always returns 0 thus allowing this function to be tail-called at the end
353 * of, say, sys_setgid().
354 */
355int commit_creds(struct cred *new)
356{
357 struct task_struct *task = current;
358 const struct cred *old;
359
360 BUG_ON(task->cred != task->real_cred);
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2);
362 BUG_ON(atomic_read(&new->usage) < 1);
363
364 old = task->real_cred;
365 security_commit_creds(new, old);
366
367 get_cred(new); /* we will require a ref for the subj creds too */
368
369 /* dumpability changes */
370 if (old->euid != new->euid ||
371 old->egid != new->egid ||
372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0;
377 smp_wmb();
378 }
379
380 /* alter the thread keyring */
381 if (new->fsuid != old->fsuid)
382 key_fsuid_changed(task);
383 if (new->fsgid != old->fsgid)
384 key_fsgid_changed(task);
385
386 /* do it
387 * - What if a process setreuid()'s and this brings the
388 * new uid over his NPROC rlimit? We can check this now
389 * cheaply with the new uid cache, so if it matters
390 * we should be checking for it. -DaveM
391 */
392 if (new->user != old->user)
393 atomic_inc(&new->user->processes);
394 rcu_assign_pointer(task->real_cred, new);
395 rcu_assign_pointer(task->cred, new);
396 if (new->user != old->user)
397 atomic_dec(&old->user->processes);
398
399 sched_switch_user(task);
400
401 /* send notifications */
402 if (new->uid != old->uid ||
403 new->euid != old->euid ||
404 new->suid != old->suid ||
405 new->fsuid != old->fsuid)
406 proc_id_connector(task, PROC_EVENT_UID);
407
408 if (new->gid != old->gid ||
409 new->egid != old->egid ||
410 new->sgid != old->sgid ||
411 new->fsgid != old->fsgid)
412 proc_id_connector(task, PROC_EVENT_GID);
413
414 /* release the old obj and subj refs both */
415 put_cred(old);
416 put_cred(old);
417 return 0;
418}
419EXPORT_SYMBOL(commit_creds);
420
421/**
422 * abort_creds - Discard a set of credentials and unlock the current task
423 * @new: The credentials that were going to be applied
424 *
425 * Discard a set of credentials that were under construction and unlock the
426 * current task.
427 */
428void abort_creds(struct cred *new)
429{
430 BUG_ON(atomic_read(&new->usage) < 1);
431 put_cred(new);
432}
433EXPORT_SYMBOL(abort_creds);
434
435/**
436 * override_creds - Override the current process's subjective credentials
437 * @new: The credentials to be assigned
438 *
439 * Install a set of temporary override subjective credentials on the current
440 * process, returning the old set for later reversion.
441 */
442const struct cred *override_creds(const struct cred *new)
443{
444 const struct cred *old = current->cred;
445
446 rcu_assign_pointer(current->cred, get_cred(new));
447 return old;
448}
449EXPORT_SYMBOL(override_creds);
450
451/**
452 * revert_creds - Revert a temporary subjective credentials override
453 * @old: The credentials to be restored
454 *
455 * Revert a temporary set of override subjective credentials to an old set,
456 * discarding the override set.
457 */
458void revert_creds(const struct cred *old)
459{
460 const struct cred *override = current->cred;
461
462 rcu_assign_pointer(current->cred, old);
463 put_cred(override);
464}
465EXPORT_SYMBOL(revert_creds);
466
467/*
468 * initialise the credentials stuff
469 */
470void __init cred_init(void)
471{
472 /* allocate a slab in which we can store credentials */
473 cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
474 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
475}
476
477/**
478 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
479 * @daemon: A userspace daemon to be used as a reference
480 *
481 * Prepare a set of credentials for a kernel service. This can then be used to
482 * override a task's own credentials so that work can be done on behalf of that
483 * task that requires a different subjective context.
484 *
485 * @daemon is used to provide a base for the security record, but can be NULL.
486 * If @daemon is supplied, then the security data will be derived from that;
487 * otherwise they'll be set to UID/GID 0, no groups, full capabilities and no keys.
488 *
489 * The caller may change these controls afterwards if desired.
490 *
491 * Returns the new credentials or NULL if out of memory.
492 *
493 * Does not take, and does not return holding current->cred_replace_mutex.
494 */
495struct cred *prepare_kernel_cred(struct task_struct *daemon)
496{
497 const struct cred *old;
498 struct cred *new;
499
500 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
501 if (!new)
502 return NULL;
503
504 if (daemon)
505 old = get_task_cred(daemon);
506 else
507 old = get_cred(&init_cred);
508
509 get_uid(new->user);
510 get_group_info(new->group_info);
511
512#ifdef CONFIG_KEYS
513 atomic_inc(&init_tgcred.usage);
514 new->tgcred = &init_tgcred;
515 new->request_key_auth = NULL;
516 new->thread_keyring = NULL;
517 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
518#endif
519
520#ifdef CONFIG_SECURITY
521 new->security = NULL;
522#endif
523 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
524 goto error;
525
526 atomic_set(&new->usage, 1);
527 put_cred(old);
528 return new;
529
530error:
531 put_cred(new);
532 return NULL;
533}
534EXPORT_SYMBOL(prepare_kernel_cred);
535
536/**
537 * set_security_override - Set the security ID in a set of credentials
538 * @new: The credentials to alter
539 * @secid: The LSM security ID to set
540 *
541 * Set the LSM security ID in a set of credentials so that the subjective
542 * security is overridden when an alternative set of credentials is used.
543 */
544int set_security_override(struct cred *new, u32 secid)
545{
546 return security_kernel_act_as(new, secid);
547}
548EXPORT_SYMBOL(set_security_override);
549
550/**
551 * set_security_override_from_ctx - Set the security ID in a set of credentials
552 * @new: The credentials to alter
553 * @secctx: The LSM security context to generate the security ID from.
554 *
555 * Set the LSM security ID in a set of credentials so that the subjective
556 * security is overridden when an alternative set of credentials is used. The
557 * security ID is specified in string form as a security context to be
558 * interpreted by the LSM.
559 */
560int set_security_override_from_ctx(struct cred *new, const char *secctx)
561{
562 u32 secid;
563 int ret;
564
565 ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
566 if (ret < 0)
567 return ret;
568
569 return set_security_override(new, secid);
570}
571EXPORT_SYMBOL(set_security_override_from_ctx);
572
573/**
574 * set_create_files_as - Set the LSM file create context in a set of credentials
575 * @new: The credentials to alter
576 * @inode: The inode to take the context from
577 *
578 * Change the LSM file creation context in a set of credentials to be the same
579 * as the object context of the specified inode, so that the new inodes have
580 * the same MAC context as that inode.
581 */
582int set_create_files_as(struct cred *new, struct inode *inode)
583{
584 new->fsuid = inode->i_uid;
585 new->fsgid = inode->i_gid;
586 return security_kernel_create_files_as(new, inode);
587}
588EXPORT_SYMBOL(set_create_files_as);
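The kernel-doc comments above (prepare_creds(), commit_creds(), abort_creds(), override_creds()/revert_creds()) describe a copy-modify-commit model rather than in-place edits of task->cred. A minimal sketch of how a setid-style code path would use that model; example_set_fsuid() is a hypothetical function, only the cred API calls come from this file:

	#include <linux/cred.h>
	#include <linux/errno.h>

	/* Hypothetical example of the copy-modify-commit pattern. */
	static long example_set_fsuid(uid_t uid)
	{
		struct cred *new;

		new = prepare_creds();		/* private copy of current's creds */
		if (!new)
			return -ENOMEM;

		new->fsuid = uid;		/* modify only the copy */

		/* commit_creds() consumes the reference and installs the copy
		 * as both objective and subjective creds; an error path would
		 * call abort_creds(new) instead to drop it. */
		return commit_creds(new);
	}

Temporary overrides follow the same ownership rules: override_creds(new) returns the old subjective creds, and revert_creds(old) restores them, as the comments above note.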
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
127 */ 127 */
128 t1 = tsk->sched_info.pcount; 128 t1 = tsk->sched_info.pcount;
129 t2 = tsk->sched_info.run_delay; 129 t2 = tsk->sched_info.run_delay;
130 t3 = tsk->sched_info.cpu_time; 130 t3 = tsk->se.sum_exec_runtime;
131 131
132 d->cpu_count += t1; 132 d->cpu_count += t1;
133 133
diff --git a/kernel/exit.c b/kernel/exit.c
index 61ba5b4b10cf..c9e5a1c14e08 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,12 +46,14 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/init_task.h>
49#include <trace/sched.h> 50#include <trace/sched.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52#include <asm/unistd.h> 53#include <asm/unistd.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
56#include "cred-internals.h"
55 57
56DEFINE_TRACE(sched_process_free); 58DEFINE_TRACE(sched_process_free);
57DEFINE_TRACE(sched_process_exit); 59DEFINE_TRACE(sched_process_exit);
@@ -168,7 +170,10 @@ void release_task(struct task_struct * p)
168 int zap_leader; 170 int zap_leader;
169repeat: 171repeat:
170 tracehook_prepare_release_task(p); 172 tracehook_prepare_release_task(p);
171 atomic_dec(&p->user->processes); 173 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials */
175 atomic_dec(&__task_cred(p)->user->processes);
176
172 proc_flush_task(p); 177 proc_flush_task(p);
173 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
174 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
@@ -343,12 +348,12 @@ static void reparent_to_kthreadd(void)
343 /* cpus_allowed? */ 348 /* cpus_allowed? */
344 /* rt_priority? */ 349 /* rt_priority? */
345 /* signals? */ 350 /* signals? */
346 security_task_reparent_to_init(current);
347 memcpy(current->signal->rlim, init_task.signal->rlim, 351 memcpy(current->signal->rlim, init_task.signal->rlim,
348 sizeof(current->signal->rlim)); 352 sizeof(current->signal->rlim));
349 atomic_inc(&(INIT_USER->__count)); 353
354 atomic_inc(&init_cred.usage);
355 commit_creds(&init_cred);
350 write_unlock_irq(&tasklist_lock); 356 write_unlock_irq(&tasklist_lock);
351 switch_uid(INIT_USER);
352} 357}
353 358
354void __set_special_pids(struct pid *pid) 359void __set_special_pids(struct pid *pid)
@@ -1032,8 +1037,6 @@ NORET_TYPE void do_exit(long code)
1032 * task into the wait-forever nirvana as well. 1037 * task into the wait-forever nirvana as well.
1033 */ 1038 */
1034 tsk->flags |= PF_EXITPIDONE; 1039 tsk->flags |= PF_EXITPIDONE;
1035 if (tsk->io_context)
1036 exit_io_context();
1037 set_current_state(TASK_UNINTERRUPTIBLE); 1040 set_current_state(TASK_UNINTERRUPTIBLE);
1038 schedule(); 1041 schedule();
1039 } 1042 }
@@ -1082,7 +1085,6 @@ NORET_TYPE void do_exit(long code)
1082 check_stack_usage(); 1085 check_stack_usage();
1083 exit_thread(); 1086 exit_thread();
1084 cgroup_exit(tsk, 1); 1087 cgroup_exit(tsk, 1);
1085 exit_keys(tsk);
1086 1088
1087 if (group_dead && tsk->signal->leader) 1089 if (group_dead && tsk->signal->leader)
1088 disassociate_ctty(1); 1090 disassociate_ctty(1);
@@ -1266,12 +1268,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1266 unsigned long state; 1268 unsigned long state;
1267 int retval, status, traced; 1269 int retval, status, traced;
1268 pid_t pid = task_pid_vnr(p); 1270 pid_t pid = task_pid_vnr(p);
1271 uid_t uid = __task_cred(p)->uid;
1269 1272
1270 if (!likely(options & WEXITED)) 1273 if (!likely(options & WEXITED))
1271 return 0; 1274 return 0;
1272 1275
1273 if (unlikely(options & WNOWAIT)) { 1276 if (unlikely(options & WNOWAIT)) {
1274 uid_t uid = p->uid;
1275 int exit_code = p->exit_code; 1277 int exit_code = p->exit_code;
1276 int why, status; 1278 int why, status;
1277 1279
@@ -1392,7 +1394,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1392 if (!retval && infop) 1394 if (!retval && infop)
1393 retval = put_user(pid, &infop->si_pid); 1395 retval = put_user(pid, &infop->si_pid);
1394 if (!retval && infop) 1396 if (!retval && infop)
1395 retval = put_user(p->uid, &infop->si_uid); 1397 retval = put_user(uid, &infop->si_uid);
1396 if (!retval) 1398 if (!retval)
1397 retval = pid; 1399 retval = pid;
1398 1400
@@ -1457,7 +1459,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1457 if (!unlikely(options & WNOWAIT)) 1459 if (!unlikely(options & WNOWAIT))
1458 p->exit_code = 0; 1460 p->exit_code = 0;
1459 1461
1460 uid = p->uid; 1462 /* don't need the RCU readlock here as we're holding a spinlock */
1463 uid = __task_cred(p)->uid;
1461unlock_sig: 1464unlock_sig:
1462 spin_unlock_irq(&p->sighand->siglock); 1465 spin_unlock_irq(&p->sighand->siglock);
1463 if (!exit_code) 1466 if (!exit_code)
@@ -1531,10 +1534,10 @@ static int wait_task_continued(struct task_struct *p, int options,
1531 } 1534 }
1532 if (!unlikely(options & WNOWAIT)) 1535 if (!unlikely(options & WNOWAIT))
1533 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1536 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1537 uid = __task_cred(p)->uid;
1534 spin_unlock_irq(&p->sighand->siglock); 1538 spin_unlock_irq(&p->sighand->siglock);
1535 1539
1536 pid = task_pid_vnr(p); 1540 pid = task_pid_vnr(p);
1537 uid = p->uid;
1538 get_task_struct(p); 1541 get_task_struct(p);
1539 read_unlock(&tasklist_lock); 1542 read_unlock(&tasklist_lock);
1540 1543
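The exit.c hunks above switch uid reads from p->uid to __task_cred(p)->uid, and each caller notes why it can skip rcu_read_lock() (the task is dead, or a spinlock/tasklist_lock pins the creds). For a caller with no such protection the expected pattern is roughly the following sketch; example_task_uid() is a hypothetical helper, __task_cred() is the accessor used above:

	#include <linux/cred.h>
	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	/* Hypothetical helper: read another task's uid with no lock held. */
	static uid_t example_task_uid(struct task_struct *p)
	{
		uid_t uid;

		rcu_read_lock();
		uid = __task_cred(p)->uid;	/* cred pointer is RCU-managed */
		rcu_read_unlock();
		return uid;
	}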
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b93da72d4a2..43cbf30669e6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -151,9 +151,8 @@ void __put_task_struct(struct task_struct *tsk)
151 WARN_ON(atomic_read(&tsk->usage)); 151 WARN_ON(atomic_read(&tsk->usage));
152 WARN_ON(tsk == current); 152 WARN_ON(tsk == current);
153 153
154 security_task_free(tsk); 154 put_cred(tsk->real_cred);
155 free_uid(tsk->user); 155 put_cred(tsk->cred);
156 put_group_info(tsk->group_info);
157 delayacct_tsk_free(tsk); 156 delayacct_tsk_free(tsk);
158 157
159 if (!profile_handoff_task(tsk)) 158 if (!profile_handoff_task(tsk))
@@ -416,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
416 set_mm_counter(mm, file_rss, 0); 415 set_mm_counter(mm, file_rss, 0);
417 set_mm_counter(mm, anon_rss, 0); 416 set_mm_counter(mm, anon_rss, 0);
418 spin_lock_init(&mm->page_table_lock); 417 spin_lock_init(&mm->page_table_lock);
419 rwlock_init(&mm->ioctx_list_lock); 418 spin_lock_init(&mm->ioctx_lock);
420 mm->ioctx_list = NULL; 419 INIT_HLIST_HEAD(&mm->ioctx_list);
421 mm->free_area_cache = TASK_UNMAPPED_BASE; 420 mm->free_area_cache = TASK_UNMAPPED_BASE;
422 mm->cached_hole_size = ~0UL; 421 mm->cached_hole_size = ~0UL;
423 mm_init_owner(mm, p); 422 mm_init_owner(mm, p);
@@ -822,12 +821,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
822 if (!sig) 821 if (!sig)
823 return -ENOMEM; 822 return -ENOMEM;
824 823
825 ret = copy_thread_group_keys(tsk);
826 if (ret < 0) {
827 kmem_cache_free(signal_cachep, sig);
828 return ret;
829 }
830
831 atomic_set(&sig->count, 1); 824 atomic_set(&sig->count, 1);
832 atomic_set(&sig->live, 1); 825 atomic_set(&sig->live, 1);
833 init_waitqueue_head(&sig->wait_chldexit); 826 init_waitqueue_head(&sig->wait_chldexit);
@@ -872,7 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
872void __cleanup_signal(struct signal_struct *sig) 865void __cleanup_signal(struct signal_struct *sig)
873{ 866{
874 thread_group_cputime_free(sig); 867 thread_group_cputime_free(sig);
875 exit_thread_group_keys(sig);
876 tty_kref_put(sig->tty); 868 tty_kref_put(sig->tty);
877 kmem_cache_free(signal_cachep, sig); 869 kmem_cache_free(signal_cachep, sig);
878} 870}
@@ -988,16 +980,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
988 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 980 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
989#endif 981#endif
990 retval = -EAGAIN; 982 retval = -EAGAIN;
991 if (atomic_read(&p->user->processes) >= 983 if (atomic_read(&p->real_cred->user->processes) >=
992 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 984 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
993 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 985 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
994 p->user != current->nsproxy->user_ns->root_user) 986 p->real_cred->user != INIT_USER)
995 goto bad_fork_free; 987 goto bad_fork_free;
996 } 988 }
997 989
998 atomic_inc(&p->user->__count); 990 retval = copy_creds(p, clone_flags);
999 atomic_inc(&p->user->processes); 991 if (retval < 0)
1000 get_group_info(p->group_info); 992 goto bad_fork_free;
1001 993
1002 /* 994 /*
1003 * If multiple threads are within copy_process(), then this check 995 * If multiple threads are within copy_process(), then this check
@@ -1052,10 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1052 do_posix_clock_monotonic_gettime(&p->start_time); 1044 do_posix_clock_monotonic_gettime(&p->start_time);
1053 p->real_start_time = p->start_time; 1045 p->real_start_time = p->start_time;
1054 monotonic_to_bootbased(&p->real_start_time); 1046 monotonic_to_bootbased(&p->real_start_time);
1055#ifdef CONFIG_SECURITY
1056 p->security = NULL;
1057#endif
1058 p->cap_bset = current->cap_bset;
1059 p->io_context = NULL; 1047 p->io_context = NULL;
1060 p->audit_context = NULL; 1048 p->audit_context = NULL;
1061 cgroup_fork(p); 1049 cgroup_fork(p);
@@ -1096,14 +1084,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1096#ifdef CONFIG_DEBUG_MUTEXES 1084#ifdef CONFIG_DEBUG_MUTEXES
1097 p->blocked_on = NULL; /* not blocked yet */ 1085 p->blocked_on = NULL; /* not blocked yet */
1098#endif 1086#endif
1087 if (unlikely(ptrace_reparented(current)))
1088 ptrace_fork(p, clone_flags);
1099 1089
1100 /* Perform scheduler related setup. Assign this task to a CPU. */ 1090 /* Perform scheduler related setup. Assign this task to a CPU. */
1101 sched_fork(p, clone_flags); 1091 sched_fork(p, clone_flags);
1102 1092
1103 if ((retval = security_task_alloc(p)))
1104 goto bad_fork_cleanup_policy;
1105 if ((retval = audit_alloc(p))) 1093 if ((retval = audit_alloc(p)))
1106 goto bad_fork_cleanup_security; 1094 goto bad_fork_cleanup_policy;
1107 /* copy all the process information */ 1095 /* copy all the process information */
1108 if ((retval = copy_semundo(clone_flags, p))) 1096 if ((retval = copy_semundo(clone_flags, p)))
1109 goto bad_fork_cleanup_audit; 1097 goto bad_fork_cleanup_audit;
@@ -1117,10 +1105,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1117 goto bad_fork_cleanup_sighand; 1105 goto bad_fork_cleanup_sighand;
1118 if ((retval = copy_mm(clone_flags, p))) 1106 if ((retval = copy_mm(clone_flags, p)))
1119 goto bad_fork_cleanup_signal; 1107 goto bad_fork_cleanup_signal;
1120 if ((retval = copy_keys(clone_flags, p)))
1121 goto bad_fork_cleanup_mm;
1122 if ((retval = copy_namespaces(clone_flags, p))) 1108 if ((retval = copy_namespaces(clone_flags, p)))
1123 goto bad_fork_cleanup_keys; 1109 goto bad_fork_cleanup_mm;
1124 if ((retval = copy_io(clone_flags, p))) 1110 if ((retval = copy_io(clone_flags, p)))
1125 goto bad_fork_cleanup_namespaces; 1111 goto bad_fork_cleanup_namespaces;
1126 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1112 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1289,8 +1275,6 @@ bad_fork_cleanup_io:
1289 put_io_context(p->io_context); 1275 put_io_context(p->io_context);
1290bad_fork_cleanup_namespaces: 1276bad_fork_cleanup_namespaces:
1291 exit_task_namespaces(p); 1277 exit_task_namespaces(p);
1292bad_fork_cleanup_keys:
1293 exit_keys(p);
1294bad_fork_cleanup_mm: 1278bad_fork_cleanup_mm:
1295 if (p->mm) 1279 if (p->mm)
1296 mmput(p->mm); 1280 mmput(p->mm);
@@ -1306,8 +1290,6 @@ bad_fork_cleanup_semundo:
1306 exit_sem(p); 1290 exit_sem(p);
1307bad_fork_cleanup_audit: 1291bad_fork_cleanup_audit:
1308 audit_free(p); 1292 audit_free(p);
1309bad_fork_cleanup_security:
1310 security_task_free(p);
1311bad_fork_cleanup_policy: 1293bad_fork_cleanup_policy:
1312#ifdef CONFIG_NUMA 1294#ifdef CONFIG_NUMA
1313 mpol_put(p->mempolicy); 1295 mpol_put(p->mempolicy);
@@ -1320,9 +1302,9 @@ bad_fork_cleanup_cgroup:
1320bad_fork_cleanup_put_domain: 1302bad_fork_cleanup_put_domain:
1321 module_put(task_thread_info(p)->exec_domain->module); 1303 module_put(task_thread_info(p)->exec_domain->module);
1322bad_fork_cleanup_count: 1304bad_fork_cleanup_count:
1323 put_group_info(p->group_info); 1305 atomic_dec(&p->cred->user->processes);
1324 atomic_dec(&p->user->processes); 1306 put_cred(p->real_cred);
1325 free_uid(p->user); 1307 put_cred(p->cred);
1326bad_fork_free: 1308bad_fork_free:
1327 free_task(p); 1309 free_task(p);
1328fork_out: 1310fork_out:
@@ -1366,6 +1348,21 @@ long do_fork(unsigned long clone_flags,
1366 long nr; 1348 long nr;
1367 1349
1368 /* 1350 /*
1351 * Do some preliminary argument and permissions checking before we
1352 * actually start allocating stuff
1353 */
1354 if (clone_flags & CLONE_NEWUSER) {
1355 if (clone_flags & CLONE_THREAD)
1356 return -EINVAL;
1357 /* hopefully this check will go away when userns support is
1358 * complete
1359 */
1360 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1361 !capable(CAP_SETGID))
1362 return -EPERM;
1363 }
1364
1365 /*
1369 * We hope to recycle these flags after 2.6.26 1366 * We hope to recycle these flags after 2.6.26
1370 */ 1367 */
1371 if (unlikely(clone_flags & CLONE_STOPPED)) { 1368 if (unlikely(clone_flags & CLONE_STOPPED)) {
@@ -1613,8 +1610,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1613 err = -EINVAL; 1610 err = -EINVAL;
1614 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1611 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1615 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1612 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1616 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| 1613 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1617 CLONE_NEWNET))
1618 goto bad_unshare_out; 1614 goto bad_unshare_out;
1619 1615
1620 /* 1616 /*
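The do_fork() hunk above adds up-front checks: CLONE_NEWUSER may not be combined with CLONE_THREAD, and for now requires CAP_SYS_ADMIN, CAP_SETUID and CAP_SETGID. From userspace the effect looks roughly like this probe; the program is illustrative only, and the raw clone() argument order shown is the x86-64 one:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		/* Fork-style clone: no new stack, the child continues here. */
		long pid = syscall(SYS_clone, CLONE_NEWUSER | SIGCHLD, 0, 0, 0);

		if (pid < 0)
			perror("clone(CLONE_NEWUSER)");	/* EPERM without the caps above */
		else if (pid == 0)
			_exit(0);			/* child */
		return 0;
	}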
diff --git a/kernel/futex.c b/kernel/futex.c
index e10c5c8786a6..7c6cbabe52b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakeup is always to make the first condition true, then 94 * The order of wakeup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -405,13 +406,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
405static struct task_struct * futex_find_get_task(pid_t pid) 406static struct task_struct * futex_find_get_task(pid_t pid)
406{ 407{
407 struct task_struct *p; 408 struct task_struct *p;
409 const struct cred *cred = current_cred(), *pcred;
408 410
409 rcu_read_lock(); 411 rcu_read_lock();
410 p = find_task_by_vpid(pid); 412 p = find_task_by_vpid(pid);
411 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 413 if (!p) {
412 p = ERR_PTR(-ESRCH); 414 p = ERR_PTR(-ESRCH);
413 else 415 } else {
414 get_task_struct(p); 416 pcred = __task_cred(p);
417 if (cred->euid != pcred->euid &&
418 cred->euid != pcred->uid)
419 p = ERR_PTR(-ESRCH);
420 else
421 get_task_struct(p);
422 }
415 423
416 rcu_read_unlock(); 424 rcu_read_unlock();
417 425
@@ -573,7 +581,7 @@ static void wake_futex(struct futex_q *q)
573 * The lock in wake_up_all() is a crucial memory barrier after the 581 * The lock in wake_up_all() is a crucial memory barrier after the
574 * plist_del() and also before assigning to q->lock_ptr. 582 * plist_del() and also before assigning to q->lock_ptr.
575 */ 583 */
576 wake_up_all(&q->waiters); 584 wake_up(&q->waiter);
577 /* 585 /*
578 * The waiting task can free the futex_q as soon as this is written, 586 * The waiting task can free the futex_q as soon as this is written,
579 * without taking any locks. This must come last. 587 * without taking any locks. This must come last.
@@ -930,7 +938,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
930{ 938{
931 struct futex_hash_bucket *hb; 939 struct futex_hash_bucket *hb;
932 940
933 init_waitqueue_head(&q->waiters); 941 init_waitqueue_head(&q->waiter);
934 942
935 get_futex_key_refs(&q->key); 943 get_futex_key_refs(&q->key);
936 hb = hash_futex(&q->key); 944 hb = hash_futex(&q->key);
@@ -1142,12 +1150,13 @@ handle_fault:
1142 * In case we must use restart_block to restart a futex_wait, 1150 * In case we must use restart_block to restart a futex_wait,
1143 * we encode in the 'flags' shared capability 1151 * we encode in the 'flags' shared capability
1144 */ 1152 */
1145#define FLAGS_SHARED 1 1153#define FLAGS_SHARED 0x01
1154#define FLAGS_CLOCKRT 0x02
1146 1155
1147static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1148 1157
1149static int futex_wait(u32 __user *uaddr, int fshared, 1158static int futex_wait(u32 __user *uaddr, int fshared,
1150 u32 val, ktime_t *abs_time, u32 bitset) 1159 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1151{ 1160{
1152 struct task_struct *curr = current; 1161 struct task_struct *curr = current;
1153 DECLARE_WAITQUEUE(wait, curr); 1162 DECLARE_WAITQUEUE(wait, curr);
@@ -1220,7 +1229,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1220 1229
1221 /* add_wait_queue is the barrier after __set_current_state. */ 1230 /* add_wait_queue is the barrier after __set_current_state. */
1222 __set_current_state(TASK_INTERRUPTIBLE); 1231 __set_current_state(TASK_INTERRUPTIBLE);
1223 add_wait_queue(&q.waiters, &wait); 1232 add_wait_queue(&q.waiter, &wait);
1224 /* 1233 /*
1225 * !plist_node_empty() is safe here without any lock. 1234 * !plist_node_empty() is safe here without any lock.
1226 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1235 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1233,8 +1242,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1233 slack = current->timer_slack_ns; 1242 slack = current->timer_slack_ns;
1234 if (rt_task(current)) 1243 if (rt_task(current))
1235 slack = 0; 1244 slack = 0;
1236 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1245 hrtimer_init_on_stack(&t.timer,
1237 HRTIMER_MODE_ABS); 1246 clockrt ? CLOCK_REALTIME :
1247 CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1238 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1239 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1250 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1240 1251
@@ -1289,6 +1300,8 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1289 1300
1290 if (fshared) 1301 if (fshared)
1291 restart->futex.flags |= FLAGS_SHARED; 1302 restart->futex.flags |= FLAGS_SHARED;
1303 if (clockrt)
1304 restart->futex.flags |= FLAGS_CLOCKRT;
1292 return -ERESTART_RESTARTBLOCK; 1305 return -ERESTART_RESTARTBLOCK;
1293 } 1306 }
1294 1307
@@ -1312,7 +1325,8 @@ static long futex_wait_restart(struct restart_block *restart)
1312 if (restart->futex.flags & FLAGS_SHARED) 1325 if (restart->futex.flags & FLAGS_SHARED)
1313 fshared = 1; 1326 fshared = 1;
1314 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1327 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1315 restart->futex.bitset); 1328 restart->futex.bitset,
1329 restart->futex.flags & FLAGS_CLOCKRT);
1316} 1330}
1317 1331
1318 1332
@@ -1558,12 +1572,11 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1558 1572
1559 uaddr_faulted: 1573 uaddr_faulted:
1560 /* 1574 /*
1561 * We have to r/w *(int __user *)uaddr, but we can't modify it 1575 * We have to r/w *(int __user *)uaddr, and we have to modify it
1562 * non-atomically. Therefore, if get_user below is not 1576 * atomically. Therefore, if we continue to fault after get_user()
1563 * enough, we need to handle the fault ourselves, while 1577 * below, we need to handle the fault ourselves, while still holding
1564 * still holding the mmap_sem. 1578 * the mmap_sem. This can occur if the uaddr is under contention as
1565 * 1579 * we have to drop the mmap_sem in order to call get_user().
1566 * ... and hb->lock. :-) --ANK
1567 */ 1580 */
1568 queue_unlock(&q, hb); 1581 queue_unlock(&q, hb);
1569 1582
@@ -1575,7 +1588,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1575 } 1588 }
1576 1589
1577 ret = get_user(uval, uaddr); 1590 ret = get_user(uval, uaddr);
1578 if (!ret && (uval != -EFAULT)) 1591 if (!ret)
1579 goto retry; 1592 goto retry;
1580 1593
1581 if (to) 1594 if (to)
@@ -1669,12 +1682,11 @@ out:
1669 1682
1670pi_faulted: 1683pi_faulted:
1671 /* 1684 /*
1672 * We have to r/w *(int __user *)uaddr, but we can't modify it 1685 * We have to r/w *(int __user *)uaddr, and we have to modify it
1673 * non-atomically. Therefore, if get_user below is not 1686 * atomically. Therefore, if we continue to fault after get_user()
1674 * enough, we need to handle the fault ourselves, while 1687 * below, we need to handle the fault ourselves, while still holding
1675 * still holding the mmap_sem. 1688 * the mmap_sem. This can occur if the uaddr is under contention as
1676 * 1689 * we have to drop the mmap_sem in order to call get_user().
1677 * ... and hb->lock. --ANK
1678 */ 1690 */
1679 spin_unlock(&hb->lock); 1691 spin_unlock(&hb->lock);
1680 1692
@@ -1687,7 +1699,7 @@ pi_faulted:
1687 } 1699 }
1688 1700
1689 ret = get_user(uval, uaddr); 1701 ret = get_user(uval, uaddr);
1690 if (!ret && (uval != -EFAULT)) 1702 if (!ret)
1691 goto retry; 1703 goto retry;
1692 1704
1693 return ret; 1705 return ret;
@@ -1742,6 +1754,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1742{ 1754{
1743 struct robust_list_head __user *head; 1755 struct robust_list_head __user *head;
1744 unsigned long ret; 1756 unsigned long ret;
1757 const struct cred *cred = current_cred(), *pcred;
1745 1758
1746 if (!futex_cmpxchg_enabled) 1759 if (!futex_cmpxchg_enabled)
1747 return -ENOSYS; 1760 return -ENOSYS;
@@ -1757,8 +1770,10 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1757 if (!p) 1770 if (!p)
1758 goto err_unlock; 1771 goto err_unlock;
1759 ret = -EPERM; 1772 ret = -EPERM;
1760 if ((current->euid != p->euid) && (current->euid != p->uid) && 1773 pcred = __task_cred(p);
1761 !capable(CAP_SYS_PTRACE)) 1774 if (cred->euid != pcred->euid &&
1775 cred->euid != pcred->uid &&
1776 !capable(CAP_SYS_PTRACE))
1762 goto err_unlock; 1777 goto err_unlock;
1763 head = p->robust_list; 1778 head = p->robust_list;
1764 rcu_read_unlock(); 1779 rcu_read_unlock();
@@ -1905,18 +1920,22 @@ void exit_robust_list(struct task_struct *curr)
1905long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1920long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1906 u32 __user *uaddr2, u32 val2, u32 val3) 1921 u32 __user *uaddr2, u32 val2, u32 val3)
1907{ 1922{
1908 int ret = -ENOSYS; 1923 int clockrt, ret = -ENOSYS;
1909 int cmd = op & FUTEX_CMD_MASK; 1924 int cmd = op & FUTEX_CMD_MASK;
1910 int fshared = 0; 1925 int fshared = 0;
1911 1926
1912 if (!(op & FUTEX_PRIVATE_FLAG)) 1927 if (!(op & FUTEX_PRIVATE_FLAG))
1913 fshared = 1; 1928 fshared = 1;
1914 1929
1930 clockrt = op & FUTEX_CLOCK_REALTIME;
1931 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1932 return -ENOSYS;
1933
1915 switch (cmd) { 1934 switch (cmd) {
1916 case FUTEX_WAIT: 1935 case FUTEX_WAIT:
1917 val3 = FUTEX_BITSET_MATCH_ANY; 1936 val3 = FUTEX_BITSET_MATCH_ANY;
1918 case FUTEX_WAIT_BITSET: 1937 case FUTEX_WAIT_BITSET:
1919 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1938 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
1920 break; 1939 break;
1921 case FUTEX_WAKE: 1940 case FUTEX_WAKE:
1922 val3 = FUTEX_BITSET_MATCH_ANY; 1941 val3 = FUTEX_BITSET_MATCH_ANY;
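The do_futex() change above accepts FUTEX_CLOCK_REALTIME only together with FUTEX_WAIT_BITSET; any other command carrying the flag now returns -ENOSYS. A hedged userspace sketch of an absolute CLOCK_REALTIME timed wait using the new flag (the wrapper futex_wait_realtime() is hypothetical, and FUTEX_CLOCK_REALTIME is assumed to be present in the installed headers):

	#define _GNU_SOURCE
	#include <linux/futex.h>
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	/* Hypothetical wrapper: block while *uaddr == val, until the absolute
	 * CLOCK_REALTIME deadline passed in abs_deadline. */
	static int futex_wait_realtime(uint32_t *uaddr, uint32_t val,
				       const struct timespec *abs_deadline)
	{
		return syscall(SYS_futex, uaddr,
			       FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
			       val, abs_deadline, NULL, FUTEX_BITSET_MATCH_ANY);
	}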
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 04ac3a9e42cf..d607a5b9ee29 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
135{ 135{
136 struct compat_robust_list_head __user *head; 136 struct compat_robust_list_head __user *head;
137 unsigned long ret; 137 unsigned long ret;
138 const struct cred *cred = current_cred(), *pcred;
138 139
139 if (!futex_cmpxchg_enabled) 140 if (!futex_cmpxchg_enabled)
140 return -ENOSYS; 141 return -ENOSYS;
@@ -150,8 +151,10 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
150 if (!p) 151 if (!p)
151 goto err_unlock; 152 goto err_unlock;
152 ret = -EPERM; 153 ret = -EPERM;
153 if ((current->euid != p->euid) && (current->euid != p->uid) && 154 pcred = __task_cred(p);
154 !capable(CAP_SYS_PTRACE)) 155 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE))
155 goto err_unlock; 158 goto err_unlock;
156 head = p->compat_robust_list; 159 head = p->compat_robust_list;
157 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b2..bda9cb924276 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 443#endif
444 444
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 445/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 446#ifdef CONFIG_HIGH_RES_TIMERS
463 447
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 635{
652} 636}
653 637
638static void __run_hrtimer(struct hrtimer *timer);
639
654/* 640/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 641 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 642 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 647 struct hrtimer_clock_base *base)
662{ 648{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 649 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 650 /*
665 /* Timer is expired, act upon the callback mode */ 651 * XXX: recursion check?
666 switch(timer->cb_mode) { 652 * hrtimer_forward() should round up with timer granularity
667 case HRTIMER_CB_IRQSAFE_PERCPU: 653 * so that we never get into infinite recursion here,
668 case HRTIMER_CB_IRQSAFE_UNLOCKED: 654 * it doesn't do that though
669 /* 655 */
670 * This is solely for the sched tick emulation with 656 __run_hrtimer(timer);
671 * dynamic tick support to ensure that we do not 657 return 1;
672 * restart the tick right on the edge and end up with
673 * the tick timer in the softirq ! The calling site
674 * takes care of this. Also used for hrtimer sleeper !
675 */
676 debug_hrtimer_deactivate(timer);
677 return 1;
678 case HRTIMER_CB_SOFTIRQ:
679 /*
680 * Move everything else into the softirq pending list !
681 */
682 list_add_tail(&timer->cb_entry,
683 &base->cpu_base->cb_pending);
684 timer->state = HRTIMER_STATE_PENDING;
685 return 1;
686 default:
687 BUG();
688 }
689 } 658 }
690 return 0; 659 return 0;
691} 660}
@@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
724 return 1; 693 return 1;
725} 694}
726 695
727static inline void hrtimer_raise_softirq(void)
728{
729 raise_softirq(HRTIMER_SOFTIRQ);
730}
731
732#else 696#else
733 697
734static inline int hrtimer_hres_active(void) { return 0; } 698static inline int hrtimer_hres_active(void) { return 0; }
@@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
747{ 711{
748 return 0; 712 return 0;
749} 713}
750static inline void hrtimer_raise_softirq(void) { }
751 714
752#endif /* CONFIG_HIGH_RES_TIMERS */ 715#endif /* CONFIG_HIGH_RES_TIMERS */
753 716
@@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
890 struct hrtimer_clock_base *base, 853 struct hrtimer_clock_base *base,
891 unsigned long newstate, int reprogram) 854 unsigned long newstate, int reprogram)
892{ 855{
893 /* High res. callback list. NOP for !HIGHRES */ 856 if (timer->state & HRTIMER_STATE_ENQUEUED) {
894 if (hrtimer_cb_pending(timer))
895 hrtimer_remove_cb_pending(timer);
896 else {
897 /* 857 /*
898 * Remove the timer from the rbtree and replace the 858 * Remove the timer from the rbtree and replace the
899 * first entry pointer if necessary. 859 * first entry pointer if necessary.
@@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
953{ 913{
954 struct hrtimer_clock_base *base, *new_base; 914 struct hrtimer_clock_base *base, *new_base;
955 unsigned long flags; 915 unsigned long flags;
956 int ret, raise; 916 int ret;
957 917
958 base = lock_hrtimer_base(timer, &flags); 918 base = lock_hrtimer_base(timer, &flags);
959 919
@@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
988 enqueue_hrtimer(timer, new_base, 948 enqueue_hrtimer(timer, new_base,
989 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 949 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
990 950
991 /*
992 * The timer may be expired and moved to the cb_pending
993 * list. We can not raise the softirq with base lock held due
994 * to a possible deadlock with runqueue lock.
995 */
996 raise = timer->state == HRTIMER_STATE_PENDING;
997
998 /*
999 * We use preempt_disable to prevent this task from migrating after
1000 * setting up the softirq and raising it. Otherwise, if me migrate
1001 * we will raise the softirq on the wrong CPU.
1002 */
1003 preempt_disable();
1004
1005 unlock_hrtimer_base(timer, &flags); 951 unlock_hrtimer_base(timer, &flags);
1006 952
1007 if (raise)
1008 hrtimer_raise_softirq();
1009 preempt_enable();
1010
1011 return ret; 953 return ret;
1012} 954}
1013EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 955EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1192} 1134}
1193EXPORT_SYMBOL_GPL(hrtimer_get_res); 1135EXPORT_SYMBOL_GPL(hrtimer_get_res);
1194 1136
1195static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1196{
1197 spin_lock_irq(&cpu_base->lock);
1198
1199 while (!list_empty(&cpu_base->cb_pending)) {
1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1201 struct hrtimer *timer;
1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1204
1205 timer = list_entry(cpu_base->cb_pending.next,
1206 struct hrtimer, cb_entry);
1207
1208 debug_hrtimer_deactivate(timer);
1209 timer_stats_account_hrtimer(timer);
1210
1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1222 spin_unlock_irq(&cpu_base->lock);
1223
1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1230
1231 spin_lock_irq(&cpu_base->lock);
1232
1233 timer->state &= ~HRTIMER_STATE_CALLBACK;
1234 if (restart == HRTIMER_RESTART) {
1235 BUG_ON(hrtimer_active(timer));
1236 /*
1237 * Enqueue the timer, allow reprogramming of the event
1238 * device
1239 */
1240 enqueue_hrtimer(timer, timer->base, 1);
1241 } else if (hrtimer_active(timer)) {
1242 /*
1243 * If the timer was rearmed on another CPU, reprogram
1244 * the event device.
1245 */
1246 struct hrtimer_clock_base *base = timer->base;
1247
1248 if (base->first == &timer->node &&
1249 hrtimer_reprogram(timer, base)) {
1250 /*
1251 * Timer is expired. Thus move it from tree to
1252 * pending list again.
1253 */
1254 __remove_hrtimer(timer, base,
1255 HRTIMER_STATE_PENDING, 0);
1256 list_add_tail(&timer->cb_entry,
1257 &base->cpu_base->cb_pending);
1258 }
1259 }
1260 }
1261 spin_unlock_irq(&cpu_base->lock);
1262}
1263
1264static void __run_hrtimer(struct hrtimer *timer) 1137static void __run_hrtimer(struct hrtimer *timer)
1265{ 1138{
1266 struct hrtimer_clock_base *base = timer->base; 1139 struct hrtimer_clock_base *base = timer->base;
@@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
1268 enum hrtimer_restart (*fn)(struct hrtimer *); 1141 enum hrtimer_restart (*fn)(struct hrtimer *);
1269 int restart; 1142 int restart;
1270 1143
1144 WARN_ON(!irqs_disabled());
1145
1271 debug_hrtimer_deactivate(timer); 1146 debug_hrtimer_deactivate(timer);
1272 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1147 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1273 timer_stats_account_hrtimer(timer); 1148 timer_stats_account_hrtimer(timer);
1274
1275 fn = timer->function; 1149 fn = timer->function;
1276 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || 1150
1277 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { 1151 /*
1278 /* 1152 * Because we run timers from hardirq context, there is no chance
1279 * Used for scheduler timers, avoid lock inversion with 1153 * they get migrated to another cpu, therefore it's safe to unlock
1280 * rq->lock and tasklist_lock. 1154 * the timer base.
1281 * 1155 */
1282 * These timers are required to deal with enqueue expiry 1156 spin_unlock(&cpu_base->lock);
1283 * themselves and are not allowed to migrate. 1157 restart = fn(timer);
1284 */ 1158 spin_lock(&cpu_base->lock);
1285 spin_unlock(&cpu_base->lock);
1286 restart = fn(timer);
1287 spin_lock(&cpu_base->lock);
1288 } else
1289 restart = fn(timer);
1290 1159
1291 /* 1160 /*
1292 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1161 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1311 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1180 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1312 struct hrtimer_clock_base *base; 1181 struct hrtimer_clock_base *base;
1313 ktime_t expires_next, now; 1182 ktime_t expires_next, now;
1314 int i, raise = 0; 1183 int i;
1315 1184
1316 BUG_ON(!cpu_base->hres_active); 1185 BUG_ON(!cpu_base->hres_active);
1317 cpu_base->nr_events++; 1186 cpu_base->nr_events++;
@@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1360 break; 1229 break;
1361 } 1230 }
1362 1231
1363 /* Move softirq callbacks to the pending list */
1364 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1365 __remove_hrtimer(timer, base,
1366 HRTIMER_STATE_PENDING, 0);
1367 list_add_tail(&timer->cb_entry,
1368 &base->cpu_base->cb_pending);
1369 raise = 1;
1370 continue;
1371 }
1372
1373 __run_hrtimer(timer); 1232 __run_hrtimer(timer);
1374 } 1233 }
1375 spin_unlock(&cpu_base->lock); 1234 spin_unlock(&cpu_base->lock);
@@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1383 if (tick_program_event(expires_next, 0)) 1242 if (tick_program_event(expires_next, 0))
1384 goto retry; 1243 goto retry;
1385 } 1244 }
1386
1387 /* Raise softirq ? */
1388 if (raise)
1389 raise_softirq(HRTIMER_SOFTIRQ);
1390} 1245}
1391 1246
1392/** 1247/**
@@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void)
1413 local_irq_restore(flags); 1268 local_irq_restore(flags);
1414} 1269}
1415 1270
1416static void run_hrtimer_softirq(struct softirq_action *h)
1417{
1418 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1419}
1420
1421#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#endif /* CONFIG_HIGH_RES_TIMERS */
1422 1272
1423/* 1273/*
@@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1429 */ 1279 */
1430void hrtimer_run_pending(void) 1280void hrtimer_run_pending(void)
1431{ 1281{
1432 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1433
1434 if (hrtimer_hres_active()) 1282 if (hrtimer_hres_active())
1435 return; 1283 return;
1436 1284
@@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void)
1444 */ 1292 */
1445 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1293 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1446 hrtimer_switch_to_hres(); 1294 hrtimer_switch_to_hres();
1447
1448 run_hrtimer_pending(cpu_base);
1449} 1295}
1450 1296
1451/* 1297/*
@@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void)
1482 hrtimer_get_expires_tv64(timer)) 1328 hrtimer_get_expires_tv64(timer))
1483 break; 1329 break;
1484 1330
1485 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1486 __remove_hrtimer(timer, base,
1487 HRTIMER_STATE_PENDING, 0);
1488 list_add_tail(&timer->cb_entry,
1489 &base->cpu_base->cb_pending);
1490 continue;
1491 }
1492
1493 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1494 } 1332 }
1495 spin_unlock(&cpu_base->lock); 1333 spin_unlock(&cpu_base->lock);
@@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1516{ 1354{
1517 sl->timer.function = hrtimer_wakeup; 1355 sl->timer.function = hrtimer_wakeup;
1518 sl->task = task; 1356 sl->task = task;
1519#ifdef CONFIG_HIGH_RES_TIMERS
1520 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1521#endif
1522} 1357}
1523 1358
1524static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1359static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,18 +1490,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1490 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1656 cpu_base->clock_base[i].cpu_base = cpu_base; 1491 cpu_base->clock_base[i].cpu_base = cpu_base;
1657 1492
1658 INIT_LIST_HEAD(&cpu_base->cb_pending);
1659 hrtimer_init_hres(cpu_base); 1493 hrtimer_init_hres(cpu_base);
1660} 1494}
1661 1495
1662#ifdef CONFIG_HOTPLUG_CPU 1496#ifdef CONFIG_HOTPLUG_CPU
1663 1497
1664static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1498static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1665 struct hrtimer_clock_base *new_base, int dcpu) 1499 struct hrtimer_clock_base *new_base)
1666{ 1500{
1667 struct hrtimer *timer; 1501 struct hrtimer *timer;
1668 struct rb_node *node; 1502 struct rb_node *node;
1669 int raise = 0;
1670 1503
1671 while ((node = rb_first(&old_base->active))) { 1504 while ((node = rb_first(&old_base->active))) {
1672 timer = rb_entry(node, struct hrtimer, node); 1505 timer = rb_entry(node, struct hrtimer, node);
@@ -1674,18 +1507,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1674 debug_hrtimer_deactivate(timer); 1507 debug_hrtimer_deactivate(timer);
1675 1508
1676 /* 1509 /*
1677 * Should not happen. Per CPU timers should be
1678 * canceled _before_ the migration code is called
1679 */
1680 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1681 __remove_hrtimer(timer, old_base,
1682 HRTIMER_STATE_INACTIVE, 0);
1683 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1684 timer, timer->function, dcpu);
1685 continue;
1686 }
1687
1688 /*
1689 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1510 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1690 * timer could be seen as !active and just vanish away 1511 * timer could be seen as !active and just vanish away
1691 * under us on another CPU 1512 * under us on another CPU
@@ -1693,69 +1514,34 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1693 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1514 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1694 timer->base = new_base; 1515 timer->base = new_base;
1695 /* 1516 /*
1696 * Enqueue the timer. Allow reprogramming of the event device 1517 * Enqueue the timers on the new cpu, but do not reprogram
1518 * the timer as that would enable a deadlock between
1519 * hrtimer_enqueue_reprogramm() running the timer and us still
1520 * holding a nested base lock.
1521 *
1522 * Instead we tickle the hrtimer interrupt after the migration
1523 * is done, which will run all expired timers and re-programm
1524 * the timer device.
1697 */ 1525 */
1698 enqueue_hrtimer(timer, new_base, 1); 1526 enqueue_hrtimer(timer, new_base, 0);
1699 1527
1700#ifdef CONFIG_HIGH_RES_TIMERS
1701 /*
1702 * Happens with high res enabled when the timer was
1703 * already expired and the callback mode is
1704 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1705 * enqueue code does not move them to the soft irq
1706 * pending list for performance/latency reasons, but
1707 * in the migration state, we need to do that
1708 * otherwise we end up with a stale timer.
1709 */
1710 if (timer->state == HRTIMER_STATE_MIGRATE) {
1711 timer->state = HRTIMER_STATE_PENDING;
1712 list_add_tail(&timer->cb_entry,
1713 &new_base->cpu_base->cb_pending);
1714 raise = 1;
1715 }
1716#endif
1717 /* Clear the migration state bit */ 1528 /* Clear the migration state bit */
1718 timer->state &= ~HRTIMER_STATE_MIGRATE; 1529 timer->state &= ~HRTIMER_STATE_MIGRATE;
1719 } 1530 }
1720 return raise;
1721}
1722
1723#ifdef CONFIG_HIGH_RES_TIMERS
1724static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1725 struct hrtimer_cpu_base *new_base)
1726{
1727 struct hrtimer *timer;
1728 int raise = 0;
1729
1730 while (!list_empty(&old_base->cb_pending)) {
1731 timer = list_entry(old_base->cb_pending.next,
1732 struct hrtimer, cb_entry);
1733
1734 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1735 timer->base = &new_base->clock_base[timer->base->index];
1736 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1737 raise = 1;
1738 }
1739 return raise;
1740}
1741#else
1742static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1743 struct hrtimer_cpu_base *new_base)
1744{
1745 return 0;
1746} 1531}
1747#endif
1748 1532
1749static void migrate_hrtimers(int cpu) 1533static int migrate_hrtimers(int scpu)
1750{ 1534{
1751 struct hrtimer_cpu_base *old_base, *new_base; 1535 struct hrtimer_cpu_base *old_base, *new_base;
1752 int i, raise = 0; 1536 int dcpu, i;
1753 1537
1754 BUG_ON(cpu_online(cpu)); 1538 BUG_ON(cpu_online(scpu));
1755 old_base = &per_cpu(hrtimer_bases, cpu); 1539 old_base = &per_cpu(hrtimer_bases, scpu);
1756 new_base = &get_cpu_var(hrtimer_bases); 1540 new_base = &get_cpu_var(hrtimer_bases);
1757 1541
1758 tick_cancel_sched_timer(cpu); 1542 dcpu = smp_processor_id();
1543
1544 tick_cancel_sched_timer(scpu);
1759 /* 1545 /*
1760 * The caller is globally serialized and nobody else 1546 * The caller is globally serialized and nobody else
1761 * takes two locks at once, deadlock is not possible. 1547 * takes two locks at once, deadlock is not possible.
@@ -1764,41 +1550,47 @@ static void migrate_hrtimers(int cpu)
1764 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1550 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1765 1551
1766 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1552 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1767 if (migrate_hrtimer_list(&old_base->clock_base[i], 1553 migrate_hrtimer_list(&old_base->clock_base[i],
1768 &new_base->clock_base[i], cpu)) 1554 &new_base->clock_base[i]);
1769 raise = 1;
1770 } 1555 }
1771 1556
1772 if (migrate_hrtimer_pending(old_base, new_base))
1773 raise = 1;
1774
1775 spin_unlock(&old_base->lock); 1557 spin_unlock(&old_base->lock);
1776 spin_unlock_irq(&new_base->lock); 1558 spin_unlock_irq(&new_base->lock);
1777 put_cpu_var(hrtimer_bases); 1559 put_cpu_var(hrtimer_bases);
1778 1560
1779 if (raise) 1561 return dcpu;
1780 hrtimer_raise_softirq(); 1562}
1563
1564static void tickle_timers(void *arg)
1565{
1566 hrtimer_peek_ahead_timers();
1781} 1567}
1568
1782#endif /* CONFIG_HOTPLUG_CPU */ 1569#endif /* CONFIG_HOTPLUG_CPU */
1783 1570
1784static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1571static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1572 unsigned long action, void *hcpu)
1786{ 1573{
1787 unsigned int cpu = (long)hcpu; 1574 int scpu = (long)hcpu;
1788 1575
1789 switch (action) { 1576 switch (action) {
1790 1577
1791 case CPU_UP_PREPARE: 1578 case CPU_UP_PREPARE:
1792 case CPU_UP_PREPARE_FROZEN: 1579 case CPU_UP_PREPARE_FROZEN:
1793 init_hrtimers_cpu(cpu); 1580 init_hrtimers_cpu(scpu);
1794 break; 1581 break;
1795 1582
1796#ifdef CONFIG_HOTPLUG_CPU 1583#ifdef CONFIG_HOTPLUG_CPU
1797 case CPU_DEAD: 1584 case CPU_DEAD:
1798 case CPU_DEAD_FROZEN: 1585 case CPU_DEAD_FROZEN:
1799 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1586 {
1800 migrate_hrtimers(cpu); 1587 int dcpu;
1588
1589 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1590 dcpu = migrate_hrtimers(scpu);
1591 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1801 break; 1592 break;
1593 }
1802#endif 1594#endif
1803 1595
1804 default: 1596 default:
@@ -1817,9 +1609,6 @@ void __init hrtimers_init(void)
1817 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1609 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1818 (void *)(long)smp_processor_id()); 1610 (void *)(long)smp_processor_id());
1819 register_cpu_notifier(&hrtimers_nb); 1611 register_cpu_notifier(&hrtimers_nb);
1820#ifdef CONFIG_HIGH_RES_TIMERS
1821 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1822#endif
1823} 1612}
1824 1613
1825/** 1614/**
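Note on the hrtimer hunks above: the softirq execution path (HRTIMER_SOFTIRQ, the cb_pending list and the per-timer cb_mode field) is removed, so every callback now runs from hard interrupt context with the per-cpu base lock dropped around it, and CPU-hotplug migration hands timers over without reprogramming, tickling the destination CPU via hrtimer_peek_ahead_timers() afterwards. A minimal sketch of timer setup once cb_mode no longer exists; my_dev, my_timeout and my_start are illustrative names, not part of this patch:

static enum hrtimer_restart my_timeout(struct hrtimer *timer)
{
	/* now always invoked in hard interrupt context, with
	 * cpu_base->lock released around the call */
	return HRTIMER_NORESTART;
}

static void my_start(struct my_dev *dev)
{
	hrtimer_init(&dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	dev->timer.function = my_timeout;
	/* dev->timer.cb_mode = HRTIMER_CB_...; no longer exists */
	hrtimer_start(&dev->timer, ktime_set(0, NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}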
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b343deedae91..f63c706d25e1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -125,6 +125,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
125 return -ENODEV; 125 return -ENODEV;
126 } 126 }
127 127
128 type &= IRQ_TYPE_SENSE_MASK;
128 if (type == IRQ_TYPE_NONE) 129 if (type == IRQ_TYPE_NONE)
129 return 0; 130 return 0;
130 131
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f1a23069c20a..6492400cb50d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -422,11 +422,8 @@ out:
422} 422}
423#endif 423#endif
424 424
425
426#ifdef CONFIG_TRACE_IRQFLAGS
427void early_init_irq_lock_class(void) 425void early_init_irq_lock_class(void)
428{ 426{
429#ifndef CONFIG_SPARSE_IRQ
430 struct irq_desc *desc; 427 struct irq_desc *desc;
431 int i; 428 int i;
432 429
@@ -436,9 +433,7 @@ void early_init_irq_lock_class(void)
436 433
437 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 434 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
438 } 435 }
439#endif
440} 436}
441#endif
442 437
443#ifdef CONFIG_SPARSE_IRQ 438#ifdef CONFIG_SPARSE_IRQ
444unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 10ad2f87ed9a..cd0cd8dcb345 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19cpumask_var_t irq_default_affinity;
19 20
20cpumask_t irq_default_affinity = CPU_MASK_ALL; 21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
21 28
22/** 29/**
23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
127 desc->status &= ~IRQ_AFFINITY_SET; 134 desc->status &= ~IRQ_AFFINITY_SET;
128 } 135 }
129 136
130 cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); 137 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
131set_affinity: 138set_affinity:
132 desc->chip->set_affinity(irq, &desc->affinity); 139 desc->chip->set_affinity(irq, &desc->affinity);
133 140
@@ -368,16 +375,18 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
368 return 0; 375 return 0;
369 } 376 }
370 377
371 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 378 /* caller masked out all except trigger mode flags */
379 ret = chip->set_type(irq, flags);
372 380
373 if (ret) 381 if (ret)
374 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 382 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
375 (int)(flags & IRQF_TRIGGER_MASK), 383 (int)flags, irq, chip->set_type);
376 irq, chip->set_type);
377 else { 384 else {
385 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
386 flags |= IRQ_LEVEL;
378 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 387 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
379 desc->status &= ~IRQ_TYPE_SENSE_MASK; 388 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
380 desc->status |= flags & IRQ_TYPE_SENSE_MASK; 389 desc->status |= flags;
381 } 390 }
382 391
383 return ret; 392 return ret;
@@ -457,7 +466,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
457 466
458 /* Setup the type (level, edge polarity) if configured: */ 467 /* Setup the type (level, edge polarity) if configured: */
459 if (new->flags & IRQF_TRIGGER_MASK) { 468 if (new->flags & IRQF_TRIGGER_MASK) {
460 ret = __irq_set_trigger(desc, irq, new->flags); 469 ret = __irq_set_trigger(desc, irq,
470 new->flags & IRQF_TRIGGER_MASK);
461 471
462 if (ret) { 472 if (ret) {
463 spin_unlock_irqrestore(&desc->lock, flags); 473 spin_unlock_irqrestore(&desc->lock, flags);
@@ -671,6 +681,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
671 struct irq_desc *desc; 681 struct irq_desc *desc;
672 int retval; 682 int retval;
673 683
684 /*
685 * handle_IRQ_event() always ignores IRQF_DISABLED except for
686 * the _first_ irqaction (sigh). That can cause oopsing, but
687 * the behavior is classified as "will not fix" so we need to
688 * start nudging drivers away from using that idiom.
689 */
690 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
691 == (IRQF_SHARED|IRQF_DISABLED))
692 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
693 "guaranteed on shared IRQs\n",
694 irq, devname);
695
674#ifdef CONFIG_LOCKDEP 696#ifdef CONFIG_LOCKDEP
675 /* 697 /*
676 * Lockdep wants atomic interrupt handlers: 698 * Lockdep wants atomic interrupt handlers:
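Note on the request_irq() hunk above: IRQF_DISABLED is only honoured for the first irqaction on a shared line, so combining it with IRQF_SHARED now prints a warning. A sketch of the driver-side consequence; my_dev, my_handler and my_probe are placeholders, not part of this patch:

static irqreturn_t my_handler(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;
	unsigned long flags;

	/* a handler that truly needs interrupts off masks them itself
	 * instead of relying on IRQF_DISABLED */
	local_irq_save(flags);
	/* ... acknowledge and service dev ... */
	local_irq_restore(flags);
	return IRQ_HANDLED;
}

static int my_probe(struct my_dev *dev)
{
	/* passing IRQF_SHARED | IRQF_DISABLED here would now trigger
	 * "IRQF_DISABLED is not guaranteed on shared IRQs" */
	return request_irq(dev->irq, my_handler, IRQF_SHARED,
			   "my_dev", dev);
}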
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 0178e2296990..089c3746358a 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -1,13 +1,8 @@
1/* 1/*
2 * linux/kernel/irq/handle.c 2 * NUMA irq-desc migration code
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the core interrupt handling code.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 * 3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
11 */ 6 */
12 7
13#include <linux/irq.h> 8#include <linux/irq.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d2c0e5ee53c5..2abd3a7716ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 const struct cpumask *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
@@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
93 93
94static int default_affinity_show(struct seq_file *m, void *v) 94static int default_affinity_show(struct seq_file *m, void *v)
95{ 95{
96 seq_cpumask(m, &irq_default_affinity); 96 seq_cpumask(m, irq_default_affinity);
97 seq_putc(m, '\n'); 97 seq_putc(m, '\n');
98 return 0; 98 return 0;
99} 99}
@@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
101static ssize_t default_affinity_write(struct file *file, 101static ssize_t default_affinity_write(struct file *file,
102 const char __user *buffer, size_t count, loff_t *ppos) 102 const char __user *buffer, size_t count, loff_t *ppos)
103{ 103{
104 cpumask_t new_value; 104 cpumask_var_t new_value;
105 int err; 105 int err;
106 106
107 err = cpumask_parse_user(buffer, count, &new_value); 107 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
108 return -ENOMEM;
109
110 err = cpumask_parse_user(buffer, count, new_value);
108 if (err) 111 if (err)
109 return err; 112 goto out;
110 113
111 if (!is_affinity_mask_valid(new_value)) 114 if (!is_affinity_mask_valid(new_value)) {
112 return -EINVAL; 115 err = -EINVAL;
116 goto out;
117 }
113 118
114 /* 119 /*
115 * Do not allow disabling IRQs completely - it's a too easy 120 * Do not allow disabling IRQs completely - it's a too easy
116 * way to make the system unusable accidentally :-) At least 121 * way to make the system unusable accidentally :-) At least
117 * one online CPU still has to be targeted. 122 * one online CPU still has to be targeted.
118 */ 123 */
119 if (!cpus_intersects(new_value, cpu_online_map)) 124 if (!cpumask_intersects(new_value, cpu_online_mask)) {
120 return -EINVAL; 125 err = -EINVAL;
126 goto out;
127 }
121 128
122 irq_default_affinity = new_value; 129 cpumask_copy(irq_default_affinity, new_value);
130 err = count;
123 131
124 return count; 132out:
133 free_cpumask_var(new_value);
134 return err;
125} 135}
126 136
127static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
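Note on the cpumask conversions above: the alloc/parse/free dance replaces a fixed-size cpumask_t on the stack. The reason, paraphrased from <linux/cpumask.h> as a sketch (not part of this patch), is that cpumask_var_t becomes a separate allocation when CONFIG_CPUMASK_OFFSTACK is set, so very large NR_CPUS configurations stop consuming kernel stack:

#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;		/* needs alloc_cpumask_var() */
#else
typedef struct cpumask cpumask_var_t[1];	/* alloc/free are no-ops */
#endif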
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..e694afa0eb8c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,20 +30,19 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33extern const unsigned long kallsyms_addresses[];
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const u8 kallsyms_names[];
35extern const u8 kallsyms_names[] __attribute__((weak));
36 35
37/* tell the compiler that the count isn't in the small data section if the arch 36/* tell the compiler that the count isn't in the small data section if the arch
38 * has one (eg: FRV) 37 * has one (eg: FRV)
39 */ 38 */
40extern const unsigned long kallsyms_num_syms 39extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 40 __attribute__((__section__(".rodata")));
42 41
43extern const u8 kallsyms_token_table[] __attribute__((weak)); 42extern const u8 kallsyms_token_table[];
44extern const u16 kallsyms_token_index[] __attribute__((weak)); 43extern const u16 kallsyms_token_index[];
45 44
46extern const unsigned long kallsyms_markers[] __attribute__((weak)); 45extern const unsigned long kallsyms_markers[];
47 46
48static inline int is_kernel_inittext(unsigned long addr) 47static inline int is_kernel_inittext(unsigned long addr)
49{ 48{
@@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
168 unsigned long symbol_start = 0, symbol_end = 0; 167 unsigned long symbol_start = 0, symbol_end = 0;
169 unsigned long i, low, high, mid; 168 unsigned long i, low, high, mid;
170 169
171 /* This kernel should never had been booted. */
172 BUG_ON(!kallsyms_addresses);
173
174 /* do a binary search on the sorted kallsyms_addresses array */ 170 /* do a binary search on the sorted kallsyms_addresses array */
175 low = 0; 171 low = 0;
176 high = kallsyms_num_syms; 172 high = kallsyms_num_syms;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d0..3fb855ad6aa0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1116 struct elf_prstatus prstatus; 1116 struct elf_prstatus prstatus;
1117 u32 *buf; 1117 u32 *buf;
1118 1118
1119 if ((cpu < 0) || (cpu >= NR_CPUS)) 1119 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1120 return; 1120 return;
1121 1121
1122 /* Using ELF notes here is opportunistic. 1122 /* Using ELF notes here is opportunistic.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3d3c3ea3a023..b46dbb908669 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
120 struct completion *complete; 120 struct completion *complete;
121 struct cred *cred;
121 char *path; 122 char *path;
122 char **argv; 123 char **argv;
123 char **envp; 124 char **envp;
124 struct key *ring;
125 enum umh_wait wait; 125 enum umh_wait wait;
126 int retval; 126 int retval;
127 struct file *stdin; 127 struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
134static int ____call_usermodehelper(void *data) 134static int ____call_usermodehelper(void *data)
135{ 135{
136 struct subprocess_info *sub_info = data; 136 struct subprocess_info *sub_info = data;
137 struct key *new_session, *old_session;
138 int retval; 137 int retval;
139 138
140 /* Unblock all signals and set the session keyring. */ 139 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
141 new_session = key_get(sub_info->ring); 140
141 /* Unblock all signals */
142 spin_lock_irq(&current->sighand->siglock); 142 spin_lock_irq(&current->sighand->siglock);
143 old_session = __install_session_keyring(current, new_session);
144 flush_signal_handlers(current, 1); 143 flush_signal_handlers(current, 1);
145 sigemptyset(&current->blocked); 144 sigemptyset(&current->blocked);
146 recalc_sigpending(); 145 recalc_sigpending();
147 spin_unlock_irq(&current->sighand->siglock); 146 spin_unlock_irq(&current->sighand->siglock);
148 147
149 key_put(old_session); 148 /* Install the credentials */
149 commit_creds(sub_info->cred);
150 sub_info->cred = NULL;
150 151
151 /* Install input pipe when needed */ 152 /* Install input pipe when needed */
152 if (sub_info->stdin) { 153 if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{ 186{
186 if (info->cleanup) 187 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp); 188 (*info->cleanup)(info->argv, info->envp);
189 if (info->cred)
190 put_cred(info->cred);
188 kfree(info); 191 kfree(info);
189} 192}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo); 193EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
240 pid_t pid; 243 pid_t pid;
241 enum umh_wait wait = sub_info->wait; 244 enum umh_wait wait = sub_info->wait;
242 245
246 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
247
243 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
244 * successfully We need the data structures to stay around 249 * successfully We need the data structures to stay around
245 * until that is done. */ 250 * until that is done. */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
362 sub_info->path = path; 367 sub_info->path = path;
363 sub_info->argv = argv; 368 sub_info->argv = argv;
364 sub_info->envp = envp; 369 sub_info->envp = envp;
370 sub_info->cred = prepare_usermodehelper_creds();
371 if (!sub_info->cred)
372 return NULL;
365 373
366 out: 374 out:
367 return sub_info; 375 return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
376void call_usermodehelper_setkeys(struct subprocess_info *info, 384void call_usermodehelper_setkeys(struct subprocess_info *info,
377 struct key *session_keyring) 385 struct key *session_keyring)
378{ 386{
379 info->ring = session_keyring; 387#ifdef CONFIG_KEYS
388 struct thread_group_cred *tgcred = info->cred->tgcred;
389 key_put(tgcred->session_keyring);
390 tgcred->session_keyring = key_get(session_keyring);
391#else
392 BUG();
393#endif
380} 394}
381EXPORT_SYMBOL(call_usermodehelper_setkeys); 395EXPORT_SYMBOL(call_usermodehelper_setkeys);
382 396
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
444 DECLARE_COMPLETION_ONSTACK(done); 458 DECLARE_COMPLETION_ONSTACK(done);
445 int retval = 0; 459 int retval = 0;
446 460
461 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
462
447 helper_lock(); 463 helper_lock();
448 if (sub_info->path[0] == '\0') 464 if (sub_info->path[0] == '\0')
449 goto out; 465 goto out;
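Note on the kmod hunks above: the usermode helper machinery now carries a fully prepared struct cred in subprocess_info instead of a bare session keyring, and the child installs it with commit_creds() before exec. Callers are unaffected; a sketch of a typical invocation (the helper path, argv and envp values are placeholders):

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/my-helper", "start", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* call_usermodehelper_setup() attaches the prepared cred and
	 * call_usermodehelper_setkeys() swaps its session keyring; the
	 * plain wrapper below hides all of that from the caller. */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}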
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c4c7df23f8c7..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -292,14 +292,12 @@ void lockdep_off(void)
292{ 292{
293 current->lockdep_recursion++; 293 current->lockdep_recursion++;
294} 294}
295
296EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
297 296
298void lockdep_on(void) 297void lockdep_on(void)
299{ 298{
300 current->lockdep_recursion--; 299 current->lockdep_recursion--;
301} 300}
302
303EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
304 302
305/* 303/*
@@ -581,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
581/* 579/*
582 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
583 */ 581 */
584static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
585{ 584{
586 struct lock_list *entry; 585 struct lock_list *entry;
587 586
@@ -2513,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2513 if (subclass) 2512 if (subclass)
2514 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2515} 2514}
2516
2517EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2518 2516
2519/* 2517/*
@@ -2694,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2694} 2692}
2695 2693
2696static int 2694static int
2697__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2698 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2699{ 2698{
2700 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2701 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2722,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2722 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2723 2722
2724found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2725 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2726 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2727 2727
@@ -2906,9 +2906,9 @@ static void check_flags(unsigned long flags)
2906#endif 2906#endif
2907} 2907}
2908 2908
2909void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2910lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2911 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2912{ 2912{
2913 unsigned long flags; 2913 unsigned long flags;
2914 2914
@@ -2918,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2918 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2919 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2920 check_flags(flags); 2920 check_flags(flags);
2921 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2922 check_chain_key(current); 2922 check_chain_key(current);
2923 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2924 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2925} 2925}
2926 2926EXPORT_SYMBOL_GPL(lock_set_class);
2927EXPORT_SYMBOL_GPL(lock_set_subclass);
2928 2927
2929/* 2928/*
2930 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2948,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2948 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2949 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2950} 2949}
2951
2952EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2953 2951
2954void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2966,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2966 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2967 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2968} 2966}
2969
2970EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2971 2968
2972#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -3451,7 +3448,6 @@ retry:
3451 if (unlock) 3448 if (unlock)
3452 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3453} 3450}
3454
3455EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3456 3452
3457/* 3453/*
@@ -3472,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3472{ 3468{
3473 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3474} 3470}
3475
3476EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3477 3472
3478void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
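Note on the lockdep hunks above: lock_set_subclass() is replaced by the more general lock_set_class(), which can also re-key a lock. A sketch of how existing subclass users can be kept working on top of the new primitive (mainline may carry an equivalent inline wrapper in <linux/lockdep.h>; this is an assumption, not shown by the diff):

static inline void lock_set_subclass(struct lockdep_map *lock,
				     unsigned int subclass,
				     unsigned long ip)
{
	/* re-register under the same name and key; only the subclass changes */
	lock_set_class(lock, lock->name, lock->key, subclass, ip);
}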
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a2583..63598dca2d0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
80 goto out_pid; 80 goto out_pid;
81 } 81 }
82 82
83 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
84 if (IS_ERR(new_nsp->user_ns)) {
85 err = PTR_ERR(new_nsp->user_ns);
86 goto out_user;
87 }
88
89 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 83 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
90 if (IS_ERR(new_nsp->net_ns)) { 84 if (IS_ERR(new_nsp->net_ns)) {
91 err = PTR_ERR(new_nsp->net_ns); 85 err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
95 return new_nsp; 89 return new_nsp;
96 90
97out_net: 91out_net:
98 if (new_nsp->user_ns)
99 put_user_ns(new_nsp->user_ns);
100out_user:
101 if (new_nsp->pid_ns) 92 if (new_nsp->pid_ns)
102 put_pid_ns(new_nsp->pid_ns); 93 put_pid_ns(new_nsp->pid_ns);
103out_pid: 94out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
130 get_nsproxy(old_ns); 121 get_nsproxy(old_ns);
131 122
132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 123 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) 124 CLONE_NEWPID | CLONE_NEWNET)))
134 return 0; 125 return 0;
135 126
136 if (!capable(CAP_SYS_ADMIN)) { 127 if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
173 put_ipc_ns(ns->ipc_ns); 164 put_ipc_ns(ns->ipc_ns);
174 if (ns->pid_ns) 165 if (ns->pid_ns)
175 put_pid_ns(ns->pid_ns); 166 put_pid_ns(ns->pid_ns);
176 if (ns->user_ns)
177 put_user_ns(ns->user_ns);
178 put_net(ns->net_ns); 167 put_net(ns->net_ns);
179 kmem_cache_free(nsproxy_cachep, ns); 168 kmem_cache_free(nsproxy_cachep, ns);
180} 169}
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 int err = 0; 178 int err = 0;
190 179
191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 180 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
192 CLONE_NEWUSER | CLONE_NEWNET))) 181 CLONE_NEWNET)))
193 return 0; 182 return 0;
194 183
195 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..13f06349a786 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -321,36 +322,27 @@ void oops_exit(void)
321} 322}
322 323
323#ifdef WANT_WARN_ON_SLOWPATH 324#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 325void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 326{
343 va_list args; 327 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 328 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 329 unsigned long caller = (unsigned long)__builtin_return_address(0);
330 const char *board;
331
346 sprint_symbol(function, caller); 332 sprint_symbol(function, caller);
347 333
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 334 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 335 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 336 line, function);
351 va_start(args, fmt); 337 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 338 if (board)
353 va_end(args); 339 printk(KERN_WARNING "Hardware name: %s\n", board);
340
341 if (fmt) {
342 va_start(args, fmt);
343 vprintk(fmt, args);
344 va_end(args);
345 }
354 346
355 print_modules(); 347 print_modules();
356 dump_stack(); 348 dump_stack();
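Note on the panic.c hunk above: warn_on_slowpath() is folded into warn_slowpath(), which now tolerates a NULL format string and additionally prints the DMI product name. A sketch of how the two WARN flavours can then share one backend; the exact macro wiring lives in <asm-generic/bug.h> and is assumed here, not shown by this diff:

#define __WARN()		warn_slowpath(__FILE__, __LINE__, NULL)
#define __WARN_printf(arg...)	warn_slowpath(__FILE__, __LINE__, arg)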
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5e79c662294b..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer)
197 return 0; 197 return 0;
198} 198}
199 199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
200/* 205/*
201 * Return nonzero if we know a priori this clockid_t value is bogus. 206 * Return nonzero if we know a priori this clockid_t value is bogus.
202 */ 207 */
@@ -248,6 +253,7 @@ static __init int init_posix_timers(void)
248 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime, 255 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create,
251 }; 257 };
252 258
253 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 259 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -313,7 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
313 319
314int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
315{ 321{
316 int shared, ret; 322 struct task_struct *task;
323 int shared, ret = -1;
317 /* 324 /*
318 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
319 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -327,8 +334,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
327 */ 334 */
328 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
329 336
330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); 337 rcu_read_lock();
331 ret = send_sigqueue(timr->sigq, timr->it_process, shared); 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
339 if (task) {
340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
341 ret = send_sigqueue(timr->sigq, task, shared);
342 }
343 rcu_read_unlock();
332 /* If we failed to send the signal the timer stops. */ 344 /* If we failed to send the signal the timer stops. */
333 return ret > 0; 345 return ret > 0;
334} 346}
@@ -405,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
405 return ret; 417 return ret;
406} 418}
407 419
408static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
409{ 421{
410 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
411 423
@@ -419,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
419 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
420 return NULL; 432 return NULL;
421 433
422 return rtn; 434 return task_pid(rtn);
423} 435}
424 436
425void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -458,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
458 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
459 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
460 } 472 }
473 put_pid(tmr->it_pid);
461 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
462 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
463} 476}
@@ -471,7 +484,6 @@ sys_timer_create(const clockid_t which_clock,
471{ 484{
472 struct k_itimer *new_timer; 485 struct k_itimer *new_timer;
473 int error, new_timer_id; 486 int error, new_timer_id;
474 struct task_struct *process;
475 sigevent_t event; 487 sigevent_t event;
476 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
477 489
@@ -525,11 +537,9 @@ sys_timer_create(const clockid_t which_clock,
525 goto out; 537 goto out;
526 } 538 }
527 rcu_read_lock(); 539 rcu_read_lock();
528 process = good_sigevent(&event); 540 new_timer->it_pid = get_pid(good_sigevent(&event));
529 if (process)
530 get_task_struct(process);
531 rcu_read_unlock(); 541 rcu_read_unlock();
532 if (!process) { 542 if (!new_timer->it_pid) {
533 error = -EINVAL; 543 error = -EINVAL;
534 goto out; 544 goto out;
535 } 545 }
@@ -537,8 +547,7 @@ sys_timer_create(const clockid_t which_clock,
537 event.sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
538 event.sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
539 event.sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
540 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
541 get_task_struct(process);
542 } 551 }
543 552
544 new_timer->it_sigev_notify = event.sigev_notify; 553 new_timer->it_sigev_notify = event.sigev_notify;
@@ -548,7 +557,7 @@ sys_timer_create(const clockid_t which_clock,
548 new_timer->sigq->info.si_code = SI_TIMER; 557 new_timer->sigq->info.si_code = SI_TIMER;
549 558
550 spin_lock_irq(&current->sighand->siglock); 559 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process; 560 new_timer->it_signal = current->signal;
552 list_add(&new_timer->list, &current->signal->posix_timers); 561 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock); 562 spin_unlock_irq(&current->sighand->siglock);
554 563
@@ -583,8 +592,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
583 timr = idr_find(&posix_timers_id, (int)timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
584 if (timr) { 593 if (timr) {
585 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
586 if (timr->it_process && 595 if (timr->it_signal == current->signal) {
587 same_thread_group(timr->it_process, current)) {
588 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
589 return timr; 597 return timr;
590 } 598 }
@@ -831,8 +839,7 @@ retry_delete:
831 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
832 * they got something (see the lock code above). 840 * they got something (see the lock code above).
833 */ 841 */
834 put_task_struct(timer->it_process); 842 timer->it_signal = NULL;
835 timer->it_process = NULL;
836 843
837 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
838 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -858,8 +865,7 @@ retry_delete:
858 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
859 * they got something (see the lock code above). 866 * they got something (see the lock code above).
860 */ 867 */
861 put_task_struct(timer->it_process); 868 timer->it_signal = NULL;
862 timer->it_process = NULL;
863 869
864 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
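Note on the posix-timers hunks above: a timer no longer pins a task_struct through it_process; it keeps a struct pid (it_pid) plus the owning signal struct (it_signal), and resolves pid to task under RCU only at delivery time. The general pattern as a sketch; my_obj and its helpers are placeholder names, not part of this patch:

struct my_obj {
	struct pid *owner;
};

static void my_obj_bind(struct my_obj *obj, struct task_struct *tsk)
{
	/* a pid reference does not keep a dead task's memory around */
	obj->owner = get_pid(task_pid(tsk));
}

static void my_obj_notify(struct my_obj *obj)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = pid_task(obj->owner, PIDTYPE_PID);
	if (tsk) {
		/* deliver to tsk, e.g. send_sigqueue() as in the hunk above */
	}
	rcu_read_unlock();
}

static void my_obj_release(struct my_obj *obj)
{
	put_pid(obj->owner);
}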
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f051477..97890831e1b5 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
31} 31}
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..e651ab05655f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
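Note on the printk.c one-liner above: recursion_bug_msg is a string array, so sizeof() also counts the terminating NUL and made printed_len one byte too long after the strcpy(). Illustrated with a stand-in message (the real text lives earlier in printk.c):

static const char recursion_bug_msg[] = "BUG: recursion detected\n";
/* sizeof(recursion_bug_msg) == 25, strlen(recursion_bug_msg) == 24;
 * strcpy() copies the NUL, but it must not be counted in printed_len. */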
diff --git a/kernel/profile.c b/kernel/profile.c
index 4cb7d68fed82..d18e2d2654f2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
45int prof_on __read_mostly; 45int prof_on __read_mostly;
46EXPORT_SYMBOL_GPL(prof_on); 46EXPORT_SYMBOL_GPL(prof_on);
47 47
48static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 48static cpumask_var_t prof_cpu_mask;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
51static DEFINE_PER_CPU(int, cpu_profile_flip); 51static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
116 return 0; 117 return 0;
117 } 118 }
118 119
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM;
122
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer) 124 if (prof_buffer)
121 return 0; 125 return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
128 if (prof_buffer) 132 if (prof_buffer)
129 return 0; 133 return 0;
130 134
135 free_cpumask_var(prof_cpu_mask);
131 return -ENOMEM; 136 return -ENOMEM;
132} 137}
133 138
@@ -386,13 +391,15 @@ out_free:
386 return NOTIFY_BAD; 391 return NOTIFY_BAD;
387 case CPU_ONLINE: 392 case CPU_ONLINE:
388 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
389 cpu_set(cpu, prof_cpu_mask); 394 if (prof_cpu_mask != NULL)
395 cpumask_set_cpu(cpu, prof_cpu_mask);
390 break; 396 break;
391 case CPU_UP_CANCELED: 397 case CPU_UP_CANCELED:
392 case CPU_UP_CANCELED_FROZEN: 398 case CPU_UP_CANCELED_FROZEN:
393 case CPU_DEAD: 399 case CPU_DEAD:
394 case CPU_DEAD_FROZEN: 400 case CPU_DEAD_FROZEN:
395 cpu_clear(cpu, prof_cpu_mask); 401 if (prof_cpu_mask != NULL)
402 cpumask_clear_cpu(cpu, prof_cpu_mask);
396 if (per_cpu(cpu_profile_hits, cpu)[0]) { 403 if (per_cpu(cpu_profile_hits, cpu)[0]) {
397 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 404 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
398 per_cpu(cpu_profile_hits, cpu)[0] = NULL; 405 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,7 +437,8 @@ void profile_tick(int type)
430 437
431 if (type == CPU_PROFILING && timer_hook) 438 if (type == CPU_PROFILING && timer_hook)
432 timer_hook(regs); 439 timer_hook(regs);
433 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) 440 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
434 profile_hit(type, (void *)profile_pc(regs)); 442 profile_hit(type, (void *)profile_pc(regs));
435} 443}
436 444
@@ -442,7 +450,7 @@ void profile_tick(int type)
442static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 450static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
443 int count, int *eof, void *data) 451 int count, int *eof, void *data)
444{ 452{
445 int len = cpumask_scnprintf(page, count, (cpumask_t *)data); 453 int len = cpumask_scnprintf(page, count, data);
446 if (count - len < 2) 454 if (count - len < 2)
447 return -EINVAL; 455 return -EINVAL;
448 len += sprintf(page + len, "\n"); 456 len += sprintf(page + len, "\n");
@@ -452,16 +460,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
452static int prof_cpu_mask_write_proc(struct file *file, 460static int prof_cpu_mask_write_proc(struct file *file,
453 const char __user *buffer, unsigned long count, void *data) 461 const char __user *buffer, unsigned long count, void *data)
454{ 462{
455 cpumask_t *mask = (cpumask_t *)data; 463 struct cpumask *mask = data;
456 unsigned long full_count = count, err; 464 unsigned long full_count = count, err;
457 cpumask_t new_value; 465 cpumask_var_t new_value;
458 466
459 err = cpumask_parse_user(buffer, count, &new_value); 467 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
460 if (err) 468 return -ENOMEM;
461 return err;
462 469
463 *mask = new_value; 470 err = cpumask_parse_user(buffer, count, new_value);
464 return full_count; 471 if (!err) {
472 cpumask_copy(mask, new_value);
473 err = full_count;
474 }
475 free_cpumask_var(new_value);
476 return err;
465} 477}
466 478
467void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 479void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +484,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
472 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 484 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
473 if (!entry) 485 if (!entry)
474 return; 486 return;
475 entry->data = (void *)&prof_cpu_mask; 487 entry->data = prof_cpu_mask;
476 entry->read_proc = prof_cpu_mask_read_proc; 488 entry->read_proc = prof_cpu_mask_read_proc;
477 entry->write_proc = prof_cpu_mask_write_proc; 489 entry->write_proc = prof_cpu_mask_write_proc;
478} 490}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4c8bcd7dd8e0..29dc700e198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28
29/*
30 * Initialize a new task whose father had been ptraced.
31 *
32 * Called from copy_process().
33 */
34void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
35{
36 arch_ptrace_fork(child, clone_flags);
37}
38
28/* 39/*
29 * ptrace a task: make the debugger its new parent and 40 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 41 * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
72 child->parent = child->real_parent; 83 child->parent = child->real_parent;
73 list_del_init(&child->ptrace_entry); 84 list_del_init(&child->ptrace_entry);
74 85
86 arch_ptrace_untrace(child);
75 if (task_is_traced(child)) 87 if (task_is_traced(child))
76 ptrace_untrace(child); 88 ptrace_untrace(child);
77} 89}
@@ -115,6 +127,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
115 127
116int __ptrace_may_access(struct task_struct *task, unsigned int mode) 128int __ptrace_may_access(struct task_struct *task, unsigned int mode)
117{ 129{
130 const struct cred *cred = current_cred(), *tcred;
131
118 /* May we inspect the given task? 132 /* May we inspect the given task?
119 * This check is used both for attaching with ptrace 133 * This check is used both for attaching with ptrace
120 * and for allowing access to sensitive information in /proc. 134 * and for allowing access to sensitive information in /proc.
@@ -127,13 +141,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
127 /* Don't let security modules deny introspection */ 141 /* Don't let security modules deny introspection */
128 if (task == current) 142 if (task == current)
129 return 0; 143 return 0;
130 if (((current->uid != task->euid) || 144 rcu_read_lock();
131 (current->uid != task->suid) || 145 tcred = __task_cred(task);
132 (current->uid != task->uid) || 146 if ((cred->uid != tcred->euid ||
133 (current->gid != task->egid) || 147 cred->uid != tcred->suid ||
134 (current->gid != task->sgid) || 148 cred->uid != tcred->uid ||
135 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 149 cred->gid != tcred->egid ||
150 cred->gid != tcred->sgid ||
151 cred->gid != tcred->gid) &&
152 !capable(CAP_SYS_PTRACE)) {
153 rcu_read_unlock();
136 return -EPERM; 154 return -EPERM;
155 }
156 rcu_read_unlock();
137 smp_rmb(); 157 smp_rmb();
138 if (task->mm) 158 if (task->mm)
139 dumpable = get_dumpable(task->mm); 159 dumpable = get_dumpable(task->mm);
@@ -163,6 +183,14 @@ int ptrace_attach(struct task_struct *task)
163 if (same_thread_group(task, current)) 183 if (same_thread_group(task, current))
164 goto out; 184 goto out;
165 185
186 /* Protect exec's credential calculations against our interference;
187 * SUID, SGID and LSM creds get determined differently under ptrace.
188 */
189 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
190 if (retval < 0)
191 goto out;
192
193 retval = -EPERM;
166repeat: 194repeat:
167 /* 195 /*
168 * Nasty, nasty. 196 * Nasty, nasty.
@@ -202,6 +230,7 @@ repeat:
202bad: 230bad:
203 write_unlock_irqrestore(&tasklist_lock, flags); 231 write_unlock_irqrestore(&tasklist_lock, flags);
204 task_unlock(task); 232 task_unlock(task);
233 mutex_unlock(&current->cred_exec_mutex);
205out: 234out:
206 return retval; 235 return retval;
207} 236}
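Note on the ptrace hunks above: the permission check now reads the target's credentials through __task_cred() under rcu_read_lock() instead of poking uid/gid fields in task_struct directly, and ptrace_attach() serializes against exec's credential calculation via cred_exec_mutex. The read-side rule as a sketch; uid_matches() is a placeholder, not a kernel helper:

static int uid_matches(struct task_struct *task)
{
	const struct cred *cred = current_cred();
	const struct cred *tcred;
	int match;

	/* another task's creds are only valid inside the RCU section
	 * and must not be dereferenced after rcu_read_unlock() */
	rcu_read_lock();
	tcred = __task_cred(task);
	match = (cred->uid == tcred->uid &&
		 cred->uid == tcred->euid &&
		 cred->uid == tcred->suid);
	rcu_read_unlock();

	return match;
}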
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index c03ca3e61919..6ec495f60ead 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
63 .completed = -300, 63 .completed = -300,
64 .pending = -300, 64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 69 .cur = -300,
70 .completed = -300, 70 .completed = -300,
71 .pending = -300, 71 .pending = -300,
72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
73 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_BITS_NONE,
74}; 74};
75 75
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
85 struct rcu_ctrlblk *rcp) 85 struct rcu_ctrlblk *rcp)
86{ 86{
87 int cpu; 87 int cpu;
88 cpumask_t cpumask;
89 unsigned long flags; 88 unsigned long flags;
90 89
91 set_need_resched(); 90 set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
96 * Don't send IPI to itself. With irqs disabled, 95 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu. 96 * rdp->cpu is the current cpu.
98 * 97 *
99 * cpu_online_map is updated by the _cpu_down() 98 * cpu_online_mask is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled 99 * using __stop_machine(). Since we're in irqs disabled
101 * section, __stop_machine() is not exectuting, hence 100 * section, __stop_machine() is not exectuting, hence
102 * the cpu_online_map is stable. 101 * the cpu_online_mask is stable.
103 * 102 *
104 * However, a cpu might have been offlined _just_ before 103 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here. 104 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
107 * notification, leading to the offlined cpu's bit 106 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask. 107 * being set in the rcp->cpumask.
109 * 108 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent 109 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
111 * sending smp_reschedule() to an offlined CPU. 110 * sending smp_reschedule() to an offlined CPU.
112 */ 111 */
113 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 112 for_each_cpu_and(cpu,
114 cpu_clear(rdp->cpu, cpumask); 113 to_cpumask(rcp->cpumask), cpu_online_mask) {
115 for_each_cpu_mask_nr(cpu, cpumask) 114 if (cpu != rdp->cpu)
116 smp_send_reschedule(cpu); 115 smp_send_reschedule(cpu);
116 }
117 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags); 118 spin_unlock_irqrestore(&rcp->lock, flags);
119} 119}
@@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
193 193
194 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
198 } 198 }
199 printk(" (detected by %d, t=%ld jiffies)\n", 199 printk(" (detected by %d, t=%ld jiffies)\n",
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
221 long delta; 221 long delta;
222 222
223 delta = jiffies - rcp->jiffies_stall; 223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 224 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
225 delta >= 0) {
225 226
226 /* We haven't checked in, so go dump stack. */ 227 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp); 228 print_cpu_stall(rcp);
@@ -406,8 +407,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
406 */ 407 */
407static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) 408static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
408{ 409{
409 cpu_clear(cpu, rcp->cpumask); 410 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
410 if (cpus_empty(rcp->cpumask)) { 411 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
411 /* batch completed ! */ 412 /* batch completed ! */
412 rcp->completed = rcp->cur; 413 rcp->completed = rcp->cur;
413 rcu_start_batch(rcp); 414 rcu_start_batch(rcp);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..f9dc8f3720f6 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
164 { "idle", "waitack", "waitzero", "waitmb" }; 164 { "idle", "waitack", "waitzero", "waitmb" };
165#endif /* #ifdef CONFIG_RCU_TRACE */ 165#endif /* #ifdef CONFIG_RCU_TRACE */
166 166
167static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; 167static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 = CPU_BITS_NONE;
168 169
169/* 170/*
170 * Enum and per-CPU flag to determine when each CPU has seen 171 * Enum and per-CPU flag to determine when each CPU has seen
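
The static cpumask_t is replaced here by a DECLARE_BITMAP() sized by NR_CPUS, which the rest of the file hands to the struct cpumask operations via to_cpumask(). A compilable userspace approximation of what that bitmap declaration amounts to; the NR_CPUS value and helper names are invented:

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
    #define BITS_TO_LONGS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
    #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

    #define NR_CPUS 72      /* made-up value, just to force a multi-word bitmap */

    static DECLARE_BITMAP(online_map, NR_CPUS);     /* zeroed, like CPU_BITS_NONE */

    static void set_bit_in(unsigned long *map, int nr)
    {
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    static int test_bit_in(const unsigned long *map, int nr)
    {
        return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1UL;
    }

    int main(void)
    {
        set_bit_in(online_map, 65);     /* lands in the second word on 64-bit */
        printf("cpu 65: %d, cpu 1: %d\n",
               test_bit_in(online_map, 65), test_bit_in(online_map, 1));
        return 0;
    }
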
@@ -551,6 +552,16 @@ void rcu_irq_exit(void)
551 } 552 }
552} 553}
553 554
555void rcu_nmi_enter(void)
556{
557 rcu_irq_enter();
558}
559
560void rcu_nmi_exit(void)
561{
562 rcu_irq_exit();
563}
564
554static void dyntick_save_progress_counter(int cpu) 565static void dyntick_save_progress_counter(int cpu)
555{ 566{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 567 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
@@ -748,7 +759,7 @@ rcu_try_flip_idle(void)
748 759
749 /* Now ask each CPU for acknowledgement of the flip. */ 760 /* Now ask each CPU for acknowledgement of the flip. */
750 761
751 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 762 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
752 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 763 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
753 dyntick_save_progress_counter(cpu); 764 dyntick_save_progress_counter(cpu);
754 } 765 }
@@ -766,7 +777,7 @@ rcu_try_flip_waitack(void)
766 int cpu; 777 int cpu;
767 778
768 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 779 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
769 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 780 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
770 if (rcu_try_flip_waitack_needed(cpu) && 781 if (rcu_try_flip_waitack_needed(cpu) &&
771 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 782 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
772 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -798,7 +809,7 @@ rcu_try_flip_waitzero(void)
798 /* Check to see if the sum of the "last" counters is zero. */ 809 /* Check to see if the sum of the "last" counters is zero. */
799 810
800 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 811 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
801 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 812 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
802 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 813 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
803 if (sum != 0) { 814 if (sum != 0) {
804 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 815 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -813,7 +824,7 @@ rcu_try_flip_waitzero(void)
813 smp_mb(); /* ^^^^^^^^^^^^ */ 824 smp_mb(); /* ^^^^^^^^^^^^ */
814 825
815 /* Call for a memory barrier from each CPU. */ 826 /* Call for a memory barrier from each CPU. */
816 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 827 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
817 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 828 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
818 dyntick_save_progress_counter(cpu); 829 dyntick_save_progress_counter(cpu);
819 } 830 }
@@ -833,7 +844,7 @@ rcu_try_flip_waitmb(void)
833 int cpu; 844 int cpu;
834 845
835 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 846 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
836 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 847 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
837 if (rcu_try_flip_waitmb_needed(cpu) && 848 if (rcu_try_flip_waitmb_needed(cpu) &&
838 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 849 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
839 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 850 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1022,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
1022 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; 1033 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1023 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; 1034 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1024 1035
1025 cpu_clear(cpu, rcu_cpu_online_map); 1036 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1026 1037
1027 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1038 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1028 1039
@@ -1062,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
1062 struct rcu_data *rdp; 1073 struct rcu_data *rdp;
1063 1074
1064 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1075 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1065 cpu_set(cpu, rcu_cpu_online_map); 1076 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1066 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1077 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1067 1078
1068 /* 1079 /*
@@ -1420,7 +1431,7 @@ void __init __rcu_init(void)
1420 * We don't need protection against CPU-Hotplug here 1431 * We don't need protection against CPU-Hotplug here
1421 * since 1432 * since
1422 * a) If a CPU comes online while we are iterating over the 1433 * a) If a CPU comes online while we are iterating over the
1423 * cpu_online_map below, we would only end up making a 1434 * cpu_online_mask below, we would only end up making a
1424 * duplicate call to rcu_online_cpu() which sets the corresponding 1435 * duplicate call to rcu_online_cpu() which sets the corresponding
1425 * CPU's mask in the rcu_cpu_online_map. 1436 * CPU's mask in the rcu_cpu_online_map.
1426 * 1437 *
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
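
The switch from atomic_set() to atomic_add() matters because rcupreempt_trace_sum() folds every CPU's counters into one summary: overwriting the accumulator keeps only the last CPU's value. A trivial userspace illustration; the per-CPU numbers are invented:

    #include <stdio.h>

    int main(void)
    {
        int per_cpu_done_invoked[3] = { 4, 7, 2 };
        int sum_overwrite = 0, sum_add = 0;
        int cpu;

        for (cpu = 0; cpu < 3; cpu++) {
            sum_overwrite = per_cpu_done_invoked[cpu];   /* old atomic_set() behaviour */
            sum_add += per_cpu_done_invoked[cpu];        /* new atomic_add() behaviour */
        }
        printf("overwrite: %d, add: %d\n", sum_overwrite, sum_add);   /* 2 vs 13 */
        return 0;
    }
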
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..3245b40952c6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 if (signal_pending(current)) {
155 mutex_lock(&fullstop_mutex);
156 if (!ACCESS_ONCE(fullstop))
157 fullstop = FULLSTOP_SIGNALED;
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE;
161}
162
139/* 163/*
140 * Allocate an element from the rcu_tortures pool. 164 * Allocate an element from the rcu_tortures pool.
141 */ 165 */
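
The new fullstop/fullstop_mutex pair makes shutdown a one-shot decision: whichever path notices first (the signal-driven notifier or the orderly module cleanup) records its reason under the mutex, and later requests leave it alone. A simplified userspace sketch of that idea using POSIX threads; the state names are stand-ins, and it compiles with -pthread:

    #include <pthread.h>
    #include <stdio.h>

    /* Hypothetical states mirroring FULLSTOP_SIGNALED / FULLSTOP_CLEANUP. */
    enum { STOP_NONE = 0, STOP_SIGNALED = 1, STOP_CLEANUP = 2 };

    static int fullstop = STOP_NONE;
    static pthread_mutex_t fullstop_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* First caller to request a stop wins; later requests are ignored. */
    static void request_stop(int reason)
    {
        pthread_mutex_lock(&fullstop_mutex);
        if (fullstop == STOP_NONE)
            fullstop = reason;
        pthread_mutex_unlock(&fullstop_mutex);
    }

    int main(void)
    {
        request_stop(STOP_SIGNALED);   /* e.g. the shutdown notifier fires first */
        request_stop(STOP_CLEANUP);    /* module unload arrives later, loses the race */
        printf("fullstop = %d\n", fullstop);   /* prints 1 */
        return 0;
    }
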
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 223static void
200rcu_stutter_wait(void) 224rcu_stutter_wait(void)
201{ 225{
202 while (stutter_pause_test || !rcutorture_runnable) 226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 227 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 228 schedule_timeout_interruptible(1);
205 else 229 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 230 schedule_timeout_interruptible(round_jiffies_relative(HZ));
231 }
207} 232}
208 233
209/* 234/*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 624 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 625 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
603 schedule_timeout_uninterruptible(1); 628 schedule_timeout_uninterruptible(1);
604 return 0; 629 return 0;
605} 630}
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 649 } while (!kthread_should_stop() && !fullstop);
625 650
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
628 schedule_timeout_uninterruptible(1); 653 schedule_timeout_uninterruptible(1);
629 return 0; 654 return 0;
630} 655}
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 760 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 761 del_timer_sync(&t);
737 while (!kthread_should_stop()) 762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
738 schedule_timeout_uninterruptible(1); 763 schedule_timeout_uninterruptible(1);
739 return 0; 764 return 0;
740} 765}
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
831 do { 856 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 857 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 858 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 859 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 861 return 0;
837} 862}
@@ -843,49 +868,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
843 */ 868 */
844static void rcu_torture_shuffle_tasks(void) 869static void rcu_torture_shuffle_tasks(void)
845{ 870{
846 cpumask_t tmp_mask; 871 cpumask_var_t tmp_mask;
847 int i; 872 int i;
848 873
849 cpus_setall(tmp_mask); 874 if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
875 BUG();
876
877 cpumask_setall(tmp_mask);
850 get_online_cpus(); 878 get_online_cpus();
851 879
852 /* No point in shuffling if there is only one online CPU (ex: UP) */ 880 /* No point in shuffling if there is only one online CPU (ex: UP) */
853 if (num_online_cpus() == 1) { 881 if (num_online_cpus() == 1)
854 put_online_cpus(); 882 goto out;
855 return;
856 }
857 883
858 if (rcu_idle_cpu != -1) 884 if (rcu_idle_cpu != -1)
859 cpu_clear(rcu_idle_cpu, tmp_mask); 885 cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
860 886
861 set_cpus_allowed_ptr(current, &tmp_mask); 887 set_cpus_allowed_ptr(current, tmp_mask);
862 888
863 if (reader_tasks) { 889 if (reader_tasks) {
864 for (i = 0; i < nrealreaders; i++) 890 for (i = 0; i < nrealreaders; i++)
865 if (reader_tasks[i]) 891 if (reader_tasks[i])
866 set_cpus_allowed_ptr(reader_tasks[i], 892 set_cpus_allowed_ptr(reader_tasks[i],
867 &tmp_mask); 893 tmp_mask);
868 } 894 }
869 895
870 if (fakewriter_tasks) { 896 if (fakewriter_tasks) {
871 for (i = 0; i < nfakewriters; i++) 897 for (i = 0; i < nfakewriters; i++)
872 if (fakewriter_tasks[i]) 898 if (fakewriter_tasks[i])
873 set_cpus_allowed_ptr(fakewriter_tasks[i], 899 set_cpus_allowed_ptr(fakewriter_tasks[i],
874 &tmp_mask); 900 tmp_mask);
875 } 901 }
876 902
877 if (writer_task) 903 if (writer_task)
878 set_cpus_allowed_ptr(writer_task, &tmp_mask); 904 set_cpus_allowed_ptr(writer_task, tmp_mask);
879 905
880 if (stats_task) 906 if (stats_task)
881 set_cpus_allowed_ptr(stats_task, &tmp_mask); 907 set_cpus_allowed_ptr(stats_task, tmp_mask);
882 908
883 if (rcu_idle_cpu == -1) 909 if (rcu_idle_cpu == -1)
884 rcu_idle_cpu = num_online_cpus() - 1; 910 rcu_idle_cpu = num_online_cpus() - 1;
885 else 911 else
886 rcu_idle_cpu--; 912 rcu_idle_cpu--;
887 913
914out:
888 put_online_cpus(); 915 put_online_cpus();
916 free_cpumask_var(tmp_mask);
889} 917}
890 918
891/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 919/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
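
With cpumask_var_t the temporary mask is no longer a large on-stack object; it must be allocated, and every exit path has to free it, hence the new out: label above. The shape of that pattern in plain userspace C; the sizes and the single-CPU shortcut are invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        size_t nwords = 64;                     /* stand-in for an off-stack cpumask */
        unsigned long *mask = calloc(nwords, sizeof(*mask));
        int nr_online = 1;                      /* pretend only one CPU is online */

        if (!mask)
            return 1;                           /* the kernel hunk BUG()s here instead */

        memset(mask, 0xff, nwords * sizeof(*mask));     /* cpumask_setall() analogue */

        if (nr_online == 1)
            goto out;                           /* nothing to shuffle, still must free */

        /* ... rebind the torture tasks to the mask here ... */
    out:
        free(mask);
        printf("mask released on every path\n");
        return 0;
    }
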
@@ -899,7 +927,7 @@ rcu_torture_shuffle(void *arg)
899 do { 927 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 928 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 929 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 930 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 931 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 932 return 0;
905} 933}
@@ -914,10 +942,10 @@ rcu_torture_stutter(void *arg)
914 do { 942 do {
915 schedule_timeout_interruptible(stutter * HZ); 943 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 944 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 945 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 946 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 947 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 948 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 949 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 950 return 0;
923} 951}
@@ -934,12 +962,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 962 stutter, irqreader);
935} 963}
936 964
965static struct notifier_block rcutorture_nb = {
966 .notifier_call = rcutorture_shutdown_notify,
967};
968
937static void 969static void
938rcu_torture_cleanup(void) 970rcu_torture_cleanup(void)
939{ 971{
940 int i; 972 int i;
941 973
942 fullstop = 1; 974 mutex_lock(&fullstop_mutex);
975 if (!fullstop) {
976 /* If being signaled, let it happen, then exit. */
977 mutex_unlock(&fullstop_mutex);
978 schedule_timeout_interruptible(10 * HZ);
979 if (cur_ops->cb_barrier != NULL)
980 cur_ops->cb_barrier();
981 return;
982 }
983 fullstop = FULLSTOP_CLEANUP;
984 mutex_unlock(&fullstop_mutex);
985 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 986 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 987 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 988 kthread_stop(stutter_task);
@@ -1015,6 +1058,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1058 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1059 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1060
1061 mutex_lock(&fullstop_mutex);
1062
1018 /* Process args and tell the world that the torturer is on the job. */ 1063 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1064 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1065 cur_ops = torture_ops[i];
@@ -1024,6 +1069,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1069 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1070 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1071 torture_type);
1072 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1073 return (-EINVAL);
1028 } 1074 }
1029 if (cur_ops->init) 1075 if (cur_ops->init)
@@ -1146,9 +1192,12 @@ rcu_torture_init(void)
1146 goto unwind; 1192 goto unwind;
1147 } 1193 }
1148 } 1194 }
1195 register_reboot_notifier(&rcutorture_nb);
1196 mutex_unlock(&fullstop_mutex);
1149 return 0; 1197 return 0;
1150 1198
1151unwind: 1199unwind:
1200 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1201 rcu_torture_cleanup();
1153 return firsterr; 1202 return firsterr;
1154} 1203}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
83#endif /* #ifdef CONFIG_NO_HZ */
84
85static int blimit = 10; /* Maximum callbacks per softirq. */
86static int qhimark = 10000; /* If this many pending, ignore blimit. */
87static int qlowmark = 100; /* Once only this many pending, use blimit. */
88
89static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
90
91/*
92 * Return the number of RCU batches processed thus far for debug & stats.
93 */
94long rcu_batches_completed(void)
95{
96 return rcu_state.completed;
97}
98EXPORT_SYMBOL_GPL(rcu_batches_completed);
99
100/*
101 * Return the number of RCU BH batches processed thus far for debug & stats.
102 */
103long rcu_batches_completed_bh(void)
104{
105 return rcu_bh_state.completed;
106}
107EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
108
109/*
110 * Does the CPU have callbacks ready to be invoked?
111 */
112static int
113cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
114{
115 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
116}
117
118/*
119 * Does the current CPU require an as-yet-unscheduled grace period?
120 */
121static int
122cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
123{
124 /* ACCESS_ONCE() because we are accessing outside of lock. */
125 return *rdp->nxttail[RCU_DONE_TAIL] &&
126 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
127}
128
129/*
130 * Return the root node of the specified rcu_state structure.
131 */
132static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
133{
134 return &rsp->node[0];
135}
136
137#ifdef CONFIG_SMP
138
139/*
140 * If the specified CPU is offline, tell the caller that it is in
141 * a quiescent state. Otherwise, whack it with a reschedule IPI.
142 * Grace periods can end up waiting on an offline CPU when that
143 * CPU is in the process of coming online -- it will be added to the
144 * rcu_node bitmasks before it actually makes it online. The same thing
145 * can happen while a CPU is in the process of coming online. Because this
146 * race is quite rare, we check for it after detecting that the grace
147 * period has been delayed rather than checking each and every CPU
148 * each and every time we start a new grace period.
149 */
150static int rcu_implicit_offline_qs(struct rcu_data *rdp)
151{
152 /*
153 * If the CPU is offline, it is in a quiescent state. We can
154 * trust its state not to change because interrupts are disabled.
155 */
156 if (cpu_is_offline(rdp->cpu)) {
157 rdp->offline_fqs++;
158 return 1;
159 }
160
161 /* The CPU is online, so send it a reschedule IPI. */
162 if (rdp->cpu != smp_processor_id())
163 smp_send_reschedule(rdp->cpu);
164 else
165 set_need_resched();
166 rdp->resched_ipi++;
167 return 0;
168}
169
170#endif /* #ifdef CONFIG_SMP */
171
172#ifdef CONFIG_NO_HZ
173static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
174
175/**
176 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
177 *
178 * Enter nohz mode, in other words, -leave- the mode in which RCU
179 * read-side critical sections can occur. (Though RCU read-side
180 * critical sections can occur in irq handlers in nohz mode, a possibility
181 * handled by rcu_irq_enter() and rcu_irq_exit()).
182 */
183void rcu_enter_nohz(void)
184{
185 unsigned long flags;
186 struct rcu_dynticks *rdtp;
187
188 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
189 local_irq_save(flags);
190 rdtp = &__get_cpu_var(rcu_dynticks);
191 rdtp->dynticks++;
192 rdtp->dynticks_nesting--;
193 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
194 local_irq_restore(flags);
195}
196
197/*
198 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
199 *
200 * Exit nohz mode, in other words, -enter- the mode in which RCU
201 * read-side critical sections normally occur.
202 */
203void rcu_exit_nohz(void)
204{
205 unsigned long flags;
206 struct rcu_dynticks *rdtp;
207
208 local_irq_save(flags);
209 rdtp = &__get_cpu_var(rcu_dynticks);
210 rdtp->dynticks++;
211 rdtp->dynticks_nesting++;
212 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
213 local_irq_restore(flags);
214 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
215}
216
217/**
218 * rcu_nmi_enter - inform RCU of entry to NMI context
219 *
220 * If the CPU was idle with dynamic ticks active, and there is no
221 * irq handler running, this updates rdtp->dynticks_nmi to let the
222 * RCU grace-period handling know that the CPU is active.
223 */
224void rcu_nmi_enter(void)
225{
226 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
227
228 if (rdtp->dynticks & 0x1)
229 return;
230 rdtp->dynticks_nmi++;
231 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
232 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
233}
234
235/**
236 * rcu_nmi_exit - inform RCU of exit from NMI context
237 *
238 * If the CPU was idle with dynamic ticks active, and there is no
239 * irq handler running, this updates rdtp->dynticks_nmi to let the
240 * RCU grace-period handling know that the CPU is no longer active.
241 */
242void rcu_nmi_exit(void)
243{
244 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
245
246 if (rdtp->dynticks & 0x1)
247 return;
248 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
249 rdtp->dynticks_nmi++;
250 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
251}
252
253/**
254 * rcu_irq_enter - inform RCU of entry to hard irq context
255 *
256 * If the CPU was idle with dynamic ticks active, this updates the
257 * rdtp->dynticks to let the RCU handling know that the CPU is active.
258 */
259void rcu_irq_enter(void)
260{
261 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
262
263 if (rdtp->dynticks_nesting++)
264 return;
265 rdtp->dynticks++;
266 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
267 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
268}
269
270/**
271 * rcu_irq_exit - inform RCU of exit from hard irq context
272 *
273 * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
274 * to let the RCU handling know that the CPU is going back to idle
275 * with no ticks.
276 */
277void rcu_irq_exit(void)
278{
279 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
280
281 if (--rdtp->dynticks_nesting)
282 return;
283 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
284 rdtp->dynticks++;
285 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
286
287 /* If the interrupt queued a callback, get out of dyntick mode. */
288 if (__get_cpu_var(rcu_data).nxtlist ||
289 __get_cpu_var(rcu_bh_data).nxtlist)
290 set_need_resched();
291}
292
293/*
294 * Record the specified "completed" value, which is later used to validate
295 * dynticks counter manipulations. Specify "rsp->completed - 1" to
296 * unconditionally invalidate any future dynticks manipulations (which is
297 * useful at the beginning of a grace period).
298 */
299static void dyntick_record_completed(struct rcu_state *rsp, long comp)
300{
301 rsp->dynticks_completed = comp;
302}
303
304#ifdef CONFIG_SMP
305
306/*
307 * Recall the previously recorded value of the completion for dynticks.
308 */
309static long dyntick_recall_completed(struct rcu_state *rsp)
310{
311 return rsp->dynticks_completed;
312}
313
314/*
315 * Snapshot the specified CPU's dynticks counter so that we can later
316 * credit them with an implicit quiescent state. Return 1 if this CPU
317 * is already in a quiescent state courtesy of dynticks idle mode.
318 */
319static int dyntick_save_progress_counter(struct rcu_data *rdp)
320{
321 int ret;
322 int snap;
323 int snap_nmi;
324
325 snap = rdp->dynticks->dynticks;
326 snap_nmi = rdp->dynticks->dynticks_nmi;
327 smp_mb(); /* Order sampling of snap with end of grace period. */
328 rdp->dynticks_snap = snap;
329 rdp->dynticks_nmi_snap = snap_nmi;
330 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
331 if (ret)
332 rdp->dynticks_fqs++;
333 return ret;
334}
335
336/*
337 * Return true if the specified CPU has passed through a quiescent
338 * state by virtue of being in or having passed through a dynticks
339 * idle state since the last call to dyntick_save_progress_counter()
340 * for this same CPU.
341 */
342static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
343{
344 long curr;
345 long curr_nmi;
346 long snap;
347 long snap_nmi;
348
349 curr = rdp->dynticks->dynticks;
350 snap = rdp->dynticks_snap;
351 curr_nmi = rdp->dynticks->dynticks_nmi;
352 snap_nmi = rdp->dynticks_nmi_snap;
353 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
354
355 /*
356 * If the CPU passed through or entered a dynticks idle phase with
357 * no active irq/NMI handlers, then we can safely pretend that the CPU
358 * already acknowledged the request to pass through a quiescent
359 * state. Either way, that CPU cannot possibly be in an RCU
360 * read-side critical section that started before the beginning
361 * of the current RCU grace period.
362 */
363 if ((curr != snap || (curr & 0x1) == 0) &&
364 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
365 rdp->dynticks_fqs++;
366 return 1;
367 }
368
369 /* Go check for the CPU being offline. */
370 return rcu_implicit_offline_qs(rdp);
371}
372
373#endif /* #ifdef CONFIG_SMP */
374
375#else /* #ifdef CONFIG_NO_HZ */
376
377static void dyntick_record_completed(struct rcu_state *rsp, long comp)
378{
379}
380
381#ifdef CONFIG_SMP
382
383/*
384 * If there are no dynticks, then the only way that a CPU can passively
385 * be in a quiescent state is to be offline. Unlike dynticks idle, which
386 * is a point in time during the prior (already finished) grace period,
387 * an offline CPU is always in a quiescent state, and thus can be
388 * unconditionally applied. So just return the current value of completed.
389 */
390static long dyntick_recall_completed(struct rcu_state *rsp)
391{
392 return rsp->completed;
393}
394
395static int dyntick_save_progress_counter(struct rcu_data *rdp)
396{
397 return 0;
398}
399
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{
402 return rcu_implicit_offline_qs(rdp);
403}
404
405#endif /* #ifdef CONFIG_SMP */
406
407#endif /* #else #ifdef CONFIG_NO_HZ */
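
The dynticks counters above encode state in their parity: the counter is incremented on every idle/active transition, so an odd value means the CPU is active and an even value means it is in dynticks idle. Comparing a snapshot with a later sample is then enough to credit an implicit quiescent state. A small userspace restatement of the test used in rcu_implicit_dynticks_qs(); the sample values are invented:

    #include <stdio.h>

    static int passed_quiescent_state(int snap, int curr)
    {
        /* Counter moved, or it is even (CPU currently in dynticks idle). */
        return curr != snap || (curr & 0x1) == 0;
    }

    int main(void)
    {
        printf("%d\n", passed_quiescent_state(8, 8));    /* 1: idle the whole time */
        printf("%d\n", passed_quiescent_state(9, 9));    /* 0: stayed active, no QS seen */
        printf("%d\n", passed_quiescent_state(9, 11));   /* 1: went idle and came back */
        return 0;
    }
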
408
409#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
410
411static void record_gp_stall_check_time(struct rcu_state *rsp)
412{
413 rsp->gp_start = jiffies;
414 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
415}
416
417static void print_other_cpu_stall(struct rcu_state *rsp)
418{
419 int cpu;
420 long delta;
421 unsigned long flags;
422 struct rcu_node *rnp = rcu_get_root(rsp);
423 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
424 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
425
426 /* Only let one CPU complain about others per time interval. */
427
428 spin_lock_irqsave(&rnp->lock, flags);
429 delta = jiffies - rsp->jiffies_stall;
430 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
431 spin_unlock_irqrestore(&rnp->lock, flags);
432 return;
433 }
434 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
435 spin_unlock_irqrestore(&rnp->lock, flags);
436
437 /* OK, time to rat on our buddy... */
438
439 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
440 for (; rnp_cur < rnp_end; rnp_cur++) {
441 if (rnp_cur->qsmask == 0)
442 continue;
443 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
444 if (rnp_cur->qsmask & (1UL << cpu))
445 printk(" %d", rnp_cur->grplo + cpu);
446 }
447 printk(" (detected by %d, t=%ld jiffies)\n",
448 smp_processor_id(), (long)(jiffies - rsp->gp_start));
449 force_quiescent_state(rsp, 0); /* Kick them all. */
450}
451
452static void print_cpu_stall(struct rcu_state *rsp)
453{
454 unsigned long flags;
455 struct rcu_node *rnp = rcu_get_root(rsp);
456
457 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
458 smp_processor_id(), jiffies - rsp->gp_start);
459 dump_stack();
460 spin_lock_irqsave(&rnp->lock, flags);
461 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
462 rsp->jiffies_stall =
463 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
464 spin_unlock_irqrestore(&rnp->lock, flags);
465 set_need_resched(); /* kick ourselves to get things going. */
466}
467
468static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
469{
470 long delta;
471 struct rcu_node *rnp;
472
473 delta = jiffies - rsp->jiffies_stall;
474 rnp = rdp->mynode;
475 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
476
477 /* We haven't checked in, so go dump stack. */
478 print_cpu_stall(rsp);
479
480 } else if (rsp->gpnum != rsp->completed &&
481 delta >= RCU_STALL_RAT_DELAY) {
482
483 /* They had two time units to dump stack, so complain. */
484 print_other_cpu_stall(rsp);
485 }
486}
487
488#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
489
490static void record_gp_stall_check_time(struct rcu_state *rsp)
491{
492}
493
494static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495{
496}
497
498#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
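
The stall detector keys off expressions such as (long)(jiffies - rsp->jiffies_stall) >= 0 rather than comparing the raw counters, so the check keeps working across a jiffies wrap. A compilable userspace illustration of the idiom; the counter values are invented:

    #include <stdio.h>

    int main(void)
    {
        unsigned long jiffies = 5;              /* counter wrapped past 0 recently */
        unsigned long deadline = -10UL;         /* set 15 ticks ago, just before the wrap */

        printf("naive compare says passed: %d\n", jiffies >= deadline);        /* 0: wrong */
        printf("signed-delta compare says passed: %d\n",
               (long)(jiffies - deadline) >= 0);                               /* 1: right */
        return 0;
    }
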
499
500/*
501 * Update CPU-local rcu_data state to record the newly noticed grace period.
502 * This is used both when we started the grace period and when we notice
503 * that someone else started the grace period.
504 */
505static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
506{
507 rdp->qs_pending = 1;
508 rdp->passed_quiesc = 0;
509 rdp->gpnum = rsp->gpnum;
510 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
511 RCU_JIFFIES_TILL_FORCE_QS;
512}
513
514/*
515 * Did someone else start a new RCU grace period since we last
516 * checked? Update local state appropriately if so. Must be called
517 * on the CPU corresponding to rdp.
518 */
519static int
520check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
521{
522 unsigned long flags;
523 int ret = 0;
524
525 local_irq_save(flags);
526 if (rdp->gpnum != rsp->gpnum) {
527 note_new_gpnum(rsp, rdp);
528 ret = 1;
529 }
530 local_irq_restore(flags);
531 return ret;
532}
533
534/*
535 * Start a new RCU grace period if warranted, re-initializing the hierarchy
536 * in preparation for detecting the next grace period. The caller must hold
537 * the root node's ->lock, which is released before return. Hard irqs must
538 * be disabled.
539 */
540static void
541rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
542 __releases(rcu_get_root(rsp)->lock)
543{
544 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
545 struct rcu_node *rnp = rcu_get_root(rsp);
546 struct rcu_node *rnp_cur;
547 struct rcu_node *rnp_end;
548
549 if (!cpu_needs_another_gp(rsp, rdp)) {
550 spin_unlock_irqrestore(&rnp->lock, flags);
551 return;
552 }
553
554 /* Advance to a new grace period and initialize state. */
555 rsp->gpnum++;
556 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
557 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
558 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
559 RCU_JIFFIES_TILL_FORCE_QS;
560 record_gp_stall_check_time(rsp);
561 dyntick_record_completed(rsp, rsp->completed - 1);
562 note_new_gpnum(rsp, rdp);
563
564 /*
565 * Because we are first, we know that all our callbacks will
566 * be covered by this upcoming grace period, even the ones
567 * that were registered arbitrarily recently.
568 */
569 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
570 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
571
572 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit;
575 spin_unlock_irqrestore(&rnp->lock, flags);
576 return;
577 }
578
579 spin_unlock(&rnp->lock); /* leave irqs disabled. */
580
581
582 /* Exclude any concurrent CPU-hotplug operations. */
583 spin_lock(&rsp->onofflock); /* irqs already disabled. */
584
585 /*
586 * Set the quiescent-state-needed bits in all the non-leaf RCU
587 * nodes for all currently online CPUs. This operation relies
588 * on the layout of the hierarchy within the rsp->node[] array.
589 * Note that other CPUs will access only the leaves of the
590 * hierarchy, which still indicate that no grace period is in
591 * progress. In addition, we have excluded CPU-hotplug operations.
592 *
593 * We therefore do not need to hold any locks. Any required
594 * memory barriers will be supplied by the locks guarding the
595 * leaf rcu_nodes in the hierarchy.
596 */
597
598 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
599 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
600 rnp_cur->qsmask = rnp_cur->qsmaskinit;
601
602 /*
603 * Now set up the leaf nodes. Here we must be careful. First,
604 * we need to hold the lock in order to exclude other CPUs, which
605 * might be contending for the leaf nodes' locks. Second, as
606 * soon as we initialize a given leaf node, its CPUs might run
607 * up the rest of the hierarchy. We must therefore acquire locks
608 * for each node that we touch during this stage. (But we still
609 * are excluding CPU-hotplug operations.)
610 *
611 * Note that the grace period cannot complete until we finish
612 * the initialization process, as there will be at least one
613 * qsmask bit set in the root node until that time, namely the
614 * one corresponding to this CPU.
615 */
616 rnp_end = &rsp->node[NUM_RCU_NODES];
617 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
618 for (; rnp_cur < rnp_end; rnp_cur++) {
619 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
622 }
623
624 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
625 spin_unlock_irqrestore(&rsp->onofflock, flags);
626}
627
628/*
629 * Advance this CPU's callbacks, but only if the current grace period
630 * has ended. This may be called only from the CPU to whom the rdp
631 * belongs.
632 */
633static void
634rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
635{
636 long completed_snap;
637 unsigned long flags;
638
639 local_irq_save(flags);
640 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
641
642 /* Did another grace period end? */
643 if (rdp->completed != completed_snap) {
644
645 /* Advance callbacks. No harm if list empty. */
646 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
647 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
648 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
649
650 /* Remember that we saw this grace-period completion. */
651 rdp->completed = completed_snap;
652 }
653 local_irq_restore(flags);
654}
655
656/*
657 * Similar to cpu_quiet(), for which it is a helper function. Allows
658 * a group of CPUs to be quieted at one go, though all the CPUs in the
659 * group must be represented by the same leaf rcu_node structure.
660 * That structure's lock must be held upon entry, and it is released
661 * before return.
662 */
663static void
664cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
665 unsigned long flags)
666 __releases(rnp->lock)
667{
668 /* Walk up the rcu_node hierarchy. */
669 for (;;) {
670 if (!(rnp->qsmask & mask)) {
671
672 /* Our bit has already been cleared, so done. */
673 spin_unlock_irqrestore(&rnp->lock, flags);
674 return;
675 }
676 rnp->qsmask &= ~mask;
677 if (rnp->qsmask != 0) {
678
679 /* Other bits still set at this level, so done. */
680 spin_unlock_irqrestore(&rnp->lock, flags);
681 return;
682 }
683 mask = rnp->grpmask;
684 if (rnp->parent == NULL) {
685
686 /* No more levels. Exit loop holding root lock. */
687
688 break;
689 }
690 spin_unlock_irqrestore(&rnp->lock, flags);
691 rnp = rnp->parent;
692 spin_lock_irqsave(&rnp->lock, flags);
693 }
694
695 /*
696 * Get here if we are the last CPU to pass through a quiescent
697 * state for this grace period. Clean up and let rcu_start_gp()
698 * start up the next grace period if one is needed. Note that
699 * we still hold rnp->lock, as required by rcu_start_gp(), which
700 * will release it.
701 */
702 rsp->completed = rsp->gpnum;
703 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
704 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
705}
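
cpu_quiet_msk() is the heart of the hierarchy: clearing bits at a leaf only propagates upward when the leaf's mask empties, so a quiescent-state report usually touches only its leaf node's lock. A lock-free userspace sketch of just the bit propagation; the two-level layout is invented, and the real code also bails out early if the bit was already clear:

    #include <stdio.h>

    struct node {
        unsigned long qsmask;
        unsigned long grpmask;          /* this node's bit in its parent */
        struct node *parent;
    };

    static struct node root = { .qsmask = 0x3 };
    static struct node leaf[2] = {
        { .qsmask = 0x3, .grpmask = 0x1, .parent = &root },
        { .qsmask = 0x3, .grpmask = 0x2, .parent = &root },
    };

    static void quiet(struct node *np, unsigned long mask)
    {
        for (;;) {
            np->qsmask &= ~mask;
            if (np->qsmask != 0 || np->parent == NULL)
                break;                  /* stop at a non-empty node or at the root */
            mask = np->grpmask;         /* leaf emptied: clear our bit one level up */
            np = np->parent;
        }
    }

    int main(void)
    {
        quiet(&leaf[0], 0x1);
        quiet(&leaf[0], 0x2);
        quiet(&leaf[1], 0x1);
        quiet(&leaf[1], 0x2);
        printf("root qsmask = %lx\n", root.qsmask);   /* 0: grace period can end */
        return 0;
    }
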
706
707/*
708 * Record a quiescent state for the specified CPU, which must either be
709 * the current CPU or an offline CPU. The lastcomp argument is used to
710 * make sure we are still in the grace period of interest. We don't want
711 * to end the current grace period based on quiescent states detected in
712 * an earlier grace period!
713 */
714static void
715cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
716{
717 unsigned long flags;
718 unsigned long mask;
719 struct rcu_node *rnp;
720
721 rnp = rdp->mynode;
722 spin_lock_irqsave(&rnp->lock, flags);
723 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
724
725 /*
726 * Someone beat us to it for this grace period, so leave.
727 * The race with GP start is resolved by the fact that we
728 * hold the leaf rcu_node lock, so that the per-CPU bits
729 * cannot yet be initialized -- so we would simply find our
730 * CPU's bit already cleared in cpu_quiet_msk() if this race
731 * occurred.
732 */
733 rdp->passed_quiesc = 0; /* try again later! */
734 spin_unlock_irqrestore(&rnp->lock, flags);
735 return;
736 }
737 mask = rdp->grpmask;
738 if ((rnp->qsmask & mask) == 0) {
739 spin_unlock_irqrestore(&rnp->lock, flags);
740 } else {
741 rdp->qs_pending = 0;
742
743 /*
744 * This GP can't end until cpu checks in, so all of our
745 * callbacks can be processed during the next GP.
746 */
747 rdp = rsp->rda[smp_processor_id()];
748 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
749
750 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
751 }
752}
753
754/*
755 * Check to see if there is a new grace period of which this CPU
756 * is not yet aware, and if so, set up local rcu_data state for it.
757 * Otherwise, see if this CPU has just passed through its first
758 * quiescent state for this grace period, and record that fact if so.
759 */
760static void
761rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
762{
763 /* If there is now a new grace period, record and return. */
764 if (check_for_new_grace_period(rsp, rdp))
765 return;
766
767 /*
768 * Does this CPU still need to do its part for current grace period?
769 * If no, return and let the other CPUs do their part as well.
770 */
771 if (!rdp->qs_pending)
772 return;
773
774 /*
775 * Was there a quiescent state since the beginning of the grace
776 * period? If no, then exit and wait for the next call.
777 */
778 if (!rdp->passed_quiesc)
779 return;
780
781 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
782 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
783}
784
785#ifdef CONFIG_HOTPLUG_CPU
786
787/*
788 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
789 * and move all callbacks from the outgoing CPU to the current one.
790 */
791static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
792{
793 int i;
794 unsigned long flags;
795 long lastcomp;
796 unsigned long mask;
797 struct rcu_data *rdp = rsp->rda[cpu];
798 struct rcu_data *rdp_me;
799 struct rcu_node *rnp;
800
801 /* Exclude any attempts to start a new grace period. */
802 spin_lock_irqsave(&rsp->onofflock, flags);
803
804 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
805 rnp = rdp->mynode;
806 mask = rdp->grpmask; /* rnp->grplo is constant. */
807 do {
808 spin_lock(&rnp->lock); /* irqs already disabled. */
809 rnp->qsmaskinit &= ~mask;
810 if (rnp->qsmaskinit != 0) {
811 spin_unlock(&rnp->lock); /* irqs already disabled. */
812 break;
813 }
814 mask = rnp->grpmask;
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 rnp = rnp->parent;
817 } while (rnp != NULL);
818 lastcomp = rsp->completed;
819
820 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
821
822 /* Being offline is a quiescent state, so go record it. */
823 cpu_quiet(cpu, rsp, rdp, lastcomp);
824
825 /*
826 * Move callbacks from the outgoing CPU to the running CPU.
827 * Note that the outgoing CPU is now quiescent, so it is now
828 * (uncharacteristically) safe to access its rcu_data structure.
829 * Note also that we must carefully retain the order of the
830 * outgoing CPU's callbacks in order for rcu_barrier() to work
831 * correctly. Finally, note that we start all the callbacks
832 * afresh, even those that have passed through a grace period
833 * and are therefore ready to invoke. The theory is that hotplug
834 * events are rare, and that if they are frequent enough to
835 * indefinitely delay callbacks, you have far worse things to
836 * be worrying about.
837 */
838 rdp_me = rsp->rda[smp_processor_id()];
839 if (rdp->nxtlist != NULL) {
840 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
841 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
842 rdp->nxtlist = NULL;
843 for (i = 0; i < RCU_NEXT_SIZE; i++)
844 rdp->nxttail[i] = &rdp->nxtlist;
845 rdp_me->qlen += rdp->qlen;
846 rdp->qlen = 0;
847 }
848 local_irq_restore(flags);
849}
850
851/*
852 * Remove the specified CPU from the RCU hierarchy and move any pending
853 * callbacks that it might have to the current CPU. This code assumes
854 * that at least one CPU in the system will remain running at all times.
855 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
856 */
857static void rcu_offline_cpu(int cpu)
858{
859 __rcu_offline_cpu(cpu, &rcu_state);
860 __rcu_offline_cpu(cpu, &rcu_bh_state);
861}
862
863#else /* #ifdef CONFIG_HOTPLUG_CPU */
864
865static void rcu_offline_cpu(int cpu)
866{
867}
868
869#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
870
871/*
872 * Invoke any RCU callbacks that have made it to the end of their grace
873 * period. Throttle as specified by rdp->blimit.
874 */
875static void rcu_do_batch(struct rcu_data *rdp)
876{
877 unsigned long flags;
878 struct rcu_head *next, *list, **tail;
879 int count;
880
881	/* If no callbacks are ready, just return. */
882 if (!cpu_has_callbacks_ready_to_invoke(rdp))
883 return;
884
885 /*
886 * Extract the list of ready callbacks, disabling to prevent
887 * races with call_rcu() from interrupt handlers.
888 */
889 local_irq_save(flags);
890 list = rdp->nxtlist;
891 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
892 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
893 tail = rdp->nxttail[RCU_DONE_TAIL];
894 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
895 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
896 rdp->nxttail[count] = &rdp->nxtlist;
897 local_irq_restore(flags);
898
899 /* Invoke callbacks. */
900 count = 0;
901 while (list) {
902 next = list->next;
903 prefetch(next);
904 list->func(list);
905 list = next;
906 if (++count >= rdp->blimit)
907 break;
908 }
909
910 local_irq_save(flags);
911
912 /* Update count, and requeue any remaining callbacks. */
913 rdp->qlen -= count;
914 if (list != NULL) {
915 *tail = rdp->nxtlist;
916 rdp->nxtlist = list;
917 for (count = 0; count < RCU_NEXT_SIZE; count++)
918 if (&rdp->nxtlist == rdp->nxttail[count])
919 rdp->nxttail[count] = tail;
920 else
921 break;
922 }
923
924 /* Reinstate batch limit if we have worked down the excess. */
925 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
926 rdp->blimit = blimit;
927
928 local_irq_restore(flags);
929
930 /* Re-raise the RCU softirq if there are callbacks remaining. */
931 if (cpu_has_callbacks_ready_to_invoke(rdp))
932 raise_softirq(RCU_SOFTIRQ);
933}
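
rcu_do_batch() throttles callback invocation: it walks the ready sublist but stops after rdp->blimit callbacks, requeueing the remainder and re-raising the softirq. The core loop restated as a runnable userspace program; the list contents and the limit are invented:

    #include <stdio.h>

    /* Minimal callback node, a stand-in for struct rcu_head. */
    struct cb {
        struct cb *next;
        void (*func)(struct cb *);
    };

    static void print_cb(struct cb *cb) { printf("invoked %p\n", (void *)cb); }

    int main(void)
    {
        struct cb nodes[5];
        struct cb *list = NULL, **tail = &list;
        struct cb *next;
        int i, count = 0, blimit = 3;           /* "blimit" mirrors rdp->blimit */

        /* Build a five-element ready list. */
        for (i = 0; i < 5; i++) {
            nodes[i].next = NULL;
            nodes[i].func = print_cb;
            *tail = &nodes[i];
            tail = &nodes[i].next;
        }

        /* Invoke callbacks, but stop after blimit of them (throttling). */
        while (list) {
            next = list->next;
            list->func(list);
            list = next;
            if (++count >= blimit)
                break;
        }
        printf("%d invoked, remainder requeued starting at %p\n",
               count, (void *)list);
        return 0;
    }
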
934
935/*
936 * Check to see if this CPU is in a non-context-switch quiescent state
937 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
938 * Also schedule the RCU softirq handler.
939 *
940 * This function must be called with hardirqs disabled. It is normally
941 * invoked from the scheduling-clock interrupt. If rcu_pending returns
942 * false, there is no point in invoking rcu_check_callbacks().
943 */
944void rcu_check_callbacks(int cpu, int user)
945{
946 if (user ||
947 (idle_cpu(cpu) && !in_softirq() &&
948 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
949
950 /*
951 * Get here if this CPU took its interrupt from user
952 * mode or from the idle loop, and if this is not a
953 * nested interrupt. In this case, the CPU is in
954 * a quiescent state, so count it.
955 *
956 * No memory barrier is required here because both
957 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
958 * only CPU-local variables that other CPUs neither
959 * access nor modify, at least not while the corresponding
960 * CPU is online.
961 */
962
963 rcu_qsctr_inc(cpu);
964 rcu_bh_qsctr_inc(cpu);
965
966 } else if (!in_softirq()) {
967
968 /*
969 * Get here if this CPU did not take its interrupt from
970 * softirq, in other words, if it is not interrupting
971 * a rcu_bh read-side critical section. This is an _bh
972 * critical section, so count it.
973 */
974
975 rcu_bh_qsctr_inc(cpu);
976 }
977 raise_softirq(RCU_SOFTIRQ);
978}
979
980#ifdef CONFIG_SMP
981
982/*
983 * Scan the leaf rcu_node structures, processing dyntick state for any that
984 * have not yet encountered a quiescent state, using the function specified.
985 * Returns 1 if the current grace period ends while scanning (possibly
986 * because we made it end).
987 */
988static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
989 int (*f)(struct rcu_data *))
990{
991 unsigned long bit;
992 int cpu;
993 unsigned long flags;
994 unsigned long mask;
995 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
996 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
997
998 for (; rnp_cur < rnp_end; rnp_cur++) {
999 mask = 0;
1000 spin_lock_irqsave(&rnp_cur->lock, flags);
1001 if (rsp->completed != lastcomp) {
1002 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1003 return 1;
1004 }
1005 if (rnp_cur->qsmask == 0) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 continue;
1008 }
1009 cpu = rnp_cur->grplo;
1010 bit = 1;
1011 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1012 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1013 mask |= bit;
1014 }
1015 if (mask != 0 && rsp->completed == lastcomp) {
1016
1017 /* cpu_quiet_msk() releases rnp_cur->lock. */
1018 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1019 continue;
1020 }
1021 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1022 }
1023 return 0;
1024}
1025
1026/*
1027 * Force quiescent states on reluctant CPUs, and also detect which
1028 * CPUs are in dyntick-idle mode.
1029 */
1030static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1031{
1032 unsigned long flags;
1033 long lastcomp;
1034 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1035 struct rcu_node *rnp = rcu_get_root(rsp);
1036 u8 signaled;
1037
1038 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1039 return; /* No grace period in progress, nothing to force. */
1040 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1041 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1042 return; /* Someone else is already on the job. */
1043 }
1044 if (relaxed &&
1045 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1046 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1047 goto unlock_ret; /* no emergency and done recently. */
1048 rsp->n_force_qs++;
1049 spin_lock(&rnp->lock);
1050 lastcomp = rsp->completed;
1051 signaled = rsp->signaled;
1052 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1053 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1054 RCU_JIFFIES_TILL_FORCE_QS;
1055 if (lastcomp == rsp->gpnum) {
1056 rsp->n_force_qs_ngp++;
1057 spin_unlock(&rnp->lock);
1058 goto unlock_ret; /* no GP in progress, time updated. */
1059 }
1060 spin_unlock(&rnp->lock);
1061 switch (signaled) {
1062 case RCU_GP_INIT:
1063
1064 break; /* grace period still initializing, ignore. */
1065
1066 case RCU_SAVE_DYNTICK:
1067
1068 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1069 break; /* So gcc recognizes the dead code. */
1070
1071 /* Record dyntick-idle state. */
1072 if (rcu_process_dyntick(rsp, lastcomp,
1073 dyntick_save_progress_counter))
1074 goto unlock_ret;
1075
1076 /* Update state, record completion counter. */
1077 spin_lock(&rnp->lock);
1078 if (lastcomp == rsp->completed) {
1079 rsp->signaled = RCU_FORCE_QS;
1080 dyntick_record_completed(rsp, lastcomp);
1081 }
1082 spin_unlock(&rnp->lock);
1083 break;
1084
1085 case RCU_FORCE_QS:
1086
1087		/* Check dyntick-idle state, send IPI to laggards. */
1088 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1089 rcu_implicit_dynticks_qs))
1090 goto unlock_ret;
1091
1092 /* Leave state in case more forcing is required. */
1093
1094 break;
1095 }
1096unlock_ret:
1097 spin_unlock_irqrestore(&rsp->fqslock, flags);
1098}
1099
1100#else /* #ifdef CONFIG_SMP */
1101
1102static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103{
1104 set_need_resched();
1105}
1106
1107#endif /* #else #ifdef CONFIG_SMP */
1108
1109/*
1110 * This does the RCU processing work from softirq context for the
1111 * specified rcu_state and rcu_data structures. This may be called
1112 * only from the CPU to whom the rdp belongs.
1113 */
1114static void
1115__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1116{
1117 unsigned long flags;
1118
1119 /*
1120 * If an RCU GP has gone long enough, go check for dyntick
1121 * idle CPUs and, if needed, send resched IPIs.
1122 */
1123 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1124 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1125 force_quiescent_state(rsp, 1);
1126
1127 /*
1128 * Advance callbacks in response to end of earlier grace
1129 * period that some other CPU ended.
1130 */
1131 rcu_process_gp_end(rsp, rdp);
1132
1133 /* Update RCU state based on any recent quiescent states. */
1134 rcu_check_quiescent_state(rsp, rdp);
1135
1136 /* Does this CPU require a not-yet-started grace period? */
1137 if (cpu_needs_another_gp(rsp, rdp)) {
1138 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1139 rcu_start_gp(rsp, flags); /* releases above lock */
1140 }
1141
1142 /* If there are callbacks ready, invoke them. */
1143 rcu_do_batch(rdp);
1144}
1145
1146/*
1147 * Do softirq processing for the current CPU.
1148 */
1149static void rcu_process_callbacks(struct softirq_action *unused)
1150{
1151 /*
1152 * Memory references from any prior RCU read-side critical sections
1153 * executed by the interrupted code must be seen before any RCU
1154 * grace-period manipulations below.
1155 */
1156 smp_mb(); /* See above block comment. */
1157
1158 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1159 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1160
1161 /*
1162 * Memory references from any later RCU read-side critical sections
1163 * executed by the interrupted code must be seen after any RCU
1164 * grace-period manipulations above.
1165 */
1166 smp_mb(); /* See above block comment. */
1167}
1168
1169static void
1170__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1171 struct rcu_state *rsp)
1172{
1173 unsigned long flags;
1174 struct rcu_data *rdp;
1175
1176 head->func = func;
1177 head->next = NULL;
1178
1179 smp_mb(); /* Ensure RCU update seen before callback registry. */
1180
1181 /*
1182 * Opportunistically note grace-period endings and beginnings.
1183 * Note that we might see a beginning right after we see an
1184 * end, but never vice versa, since this CPU has to pass through
1185 * a quiescent state betweentimes.
1186 */
1187 local_irq_save(flags);
1188 rdp = rsp->rda[smp_processor_id()];
1189 rcu_process_gp_end(rsp, rdp);
1190 check_for_new_grace_period(rsp, rdp);
1191
1192 /* Add the callback to our list. */
1193 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1194 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1195
1196 /* Start a new grace period if one not already started. */
1197 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1198 unsigned long nestflag;
1199 struct rcu_node *rnp_root = rcu_get_root(rsp);
1200
1201 spin_lock_irqsave(&rnp_root->lock, nestflag);
1202 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1203 }
1204
1205 /* Force the grace period if too many callbacks or too long waiting. */
1206 if (unlikely(++rdp->qlen > qhimark)) {
1207 rdp->blimit = LONG_MAX;
1208 force_quiescent_state(rsp, 0);
1209 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1210 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1211 force_quiescent_state(rsp, 1);
1212 local_irq_restore(flags);
1213}
1214
1215/*
1216 * Queue an RCU callback for invocation after a grace period.
1217 */
1218void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1219{
1220 __call_rcu(head, func, &rcu_state);
1221}
1222EXPORT_SYMBOL_GPL(call_rcu);
1223
1224/*
1225 * Queue an RCU callback for invocation after a quicker grace period.
1226 */
1227void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1228{
1229 __call_rcu(head, func, &rcu_bh_state);
1230}
1231EXPORT_SYMBOL_GPL(call_rcu_bh);
1232
1233/*
1234 * Check to see if there is any immediate RCU-related work to be done
1235 * by the current CPU, for the specified type of RCU, returning 1 if so.
1236 * The checks are in order of increasing expense: checks that can be
1237 * carried out against CPU-local state are performed first. However,
1238 * we must check for CPU stalls first, else we might not get a chance.
1239 */
1240static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1241{
1242 rdp->n_rcu_pending++;
1243
1244 /* Check for CPU stalls, if enabled. */
1245 check_cpu_stall(rsp, rdp);
1246
1247 /* Is the RCU core waiting for a quiescent state from this CPU? */
1248 if (rdp->qs_pending)
1249 return 1;
1250
1251 /* Does this CPU have callbacks ready to invoke? */
1252 if (cpu_has_callbacks_ready_to_invoke(rdp))
1253 return 1;
1254
1255 /* Has RCU gone idle with this CPU needing another grace period? */
1256 if (cpu_needs_another_gp(rsp, rdp))
1257 return 1;
1258
1259 /* Has another RCU grace period completed? */
1260 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1261 return 1;
1262
1263 /* Has a new RCU grace period started? */
1264 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1265 return 1;
1266
1267 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1268 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1269 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1270 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1271 return 1;
1272
1273 /* nothing to do */
1274 return 0;
1275}
1276
1277/*
1278 * Check to see if there is any immediate RCU-related work to be done
1279 * by the current CPU, returning 1 if so. This function is part of the
1280 * RCU implementation; it is -not- an exported member of the RCU API.
1281 */
1282int rcu_pending(int cpu)
1283{
1284 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1285 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1286}
1287
1288/*
1289 * Check to see if any future RCU-related work will need to be done
1290 * by the current CPU, even if none need be done immediately, returning
1291 * 1 if so. This function is part of the RCU implementation; it is -not-
1292 * an exported member of the RCU API.
1293 */
1294int rcu_needs_cpu(int cpu)
1295{
1296 /* RCU callbacks either ready or pending? */
1297 return per_cpu(rcu_data, cpu).nxtlist ||
1298 per_cpu(rcu_bh_data, cpu).nxtlist;
1299}
1300
1301/*
1302 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1303 * approach so that we don't have to worry about how long the CPU has
1304 * been gone, or whether it ever was online previously. We do trust the
1305 * ->mynode field, as it is constant for a given struct rcu_data and
1306 * initialized during early boot.
1307 *
1308 * Note that only one online or offline event can be happening at a given
1309 * time. Note also that we can accept some slop in the rsp->completed
1310 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet.
1312 */
1313static void
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{
1316 unsigned long flags;
1317 int i;
1318 long lastcomp;
1319 unsigned long mask;
1320 struct rcu_data *rdp = rsp->rda[cpu];
1321 struct rcu_node *rnp = rcu_get_root(rsp);
1322
1323 /* Set up local state, ensuring consistent view of global state. */
1324 spin_lock_irqsave(&rnp->lock, flags);
1325 lastcomp = rsp->completed;
1326 rdp->completed = lastcomp;
1327 rdp->gpnum = lastcomp;
1328 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1329 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1330 rdp->beenonline = 1; /* We have now been online. */
1331 rdp->passed_quiesc_completed = lastcomp - 1;
1332 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1333 rdp->nxtlist = NULL;
1334 for (i = 0; i < RCU_NEXT_SIZE; i++)
1335 rdp->nxttail[i] = &rdp->nxtlist;
1336 rdp->qlen = 0;
1337 rdp->blimit = blimit;
1338#ifdef CONFIG_NO_HZ
1339 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1340#endif /* #ifdef CONFIG_NO_HZ */
1341 rdp->cpu = cpu;
1342 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1343
1344 /*
1345 * A new grace period might start here. If so, we won't be part
1346 * of it, but that is OK, as we are currently in a quiescent state.
1347 */
1348
1349 /* Exclude any attempts to start a new GP on large systems. */
1350 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1351
1352 /* Add CPU to rcu_node bitmasks. */
1353 rnp = rdp->mynode;
1354 mask = rdp->grpmask;
1355 do {
1356 /* Exclude any attempts to start a new GP on small systems. */
1357 spin_lock(&rnp->lock); /* irqs already disabled. */
1358 rnp->qsmaskinit |= mask;
1359 mask = rnp->grpmask;
1360 spin_unlock(&rnp->lock); /* irqs already disabled. */
1361 rnp = rnp->parent;
1362 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1363
1364 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1365
1366 /*
1367 * A new grace period might start here. If so, we will be part of
1368 * it, and its gpnum will be greater than ours, so we will
1369 * participate. It is also possible for the gpnum to have been
1370 * incremented before this function was called, and the bitmasks
1371 * to not be filled out until now, in which case we will also
1372 * participate due to our gpnum being behind.
1373 */
1374
1375 /* Since it is coming online, the CPU is in a quiescent state. */
1376 cpu_quiet(cpu, rsp, rdp, lastcomp);
1377 local_irq_restore(flags);
1378}
1379
1380static void __cpuinit rcu_online_cpu(int cpu)
1381{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1392}
1393
1394/*
1395 * Handle CPU online/offline notification events.
1396 */
1397static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1398 unsigned long action, void *hcpu)
1399{
1400 long cpu = (long)hcpu;
1401
1402 switch (action) {
1403 case CPU_UP_PREPARE:
1404 case CPU_UP_PREPARE_FROZEN:
1405 rcu_online_cpu(cpu);
1406 break;
1407 case CPU_DEAD:
1408 case CPU_DEAD_FROZEN:
1409 case CPU_UP_CANCELED:
1410 case CPU_UP_CANCELED_FROZEN:
1411 rcu_offline_cpu(cpu);
1412 break;
1413 default:
1414 break;
1415 }
1416 return NOTIFY_OK;
1417}
1418
1419/*
1420 * Compute the per-level fanout, either using the exact fanout specified
1421 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1422 */
1423#ifdef CONFIG_RCU_FANOUT_EXACT
1424static void __init rcu_init_levelspread(struct rcu_state *rsp)
1425{
1426 int i;
1427
1428 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1429 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1430}
1431#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1432static void __init rcu_init_levelspread(struct rcu_state *rsp)
1433{
1434 int ccur;
1435 int cprv;
1436 int i;
1437
1438 cprv = NR_CPUS;
1439 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1440 ccur = rsp->levelcnt[i];
1441 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1442 cprv = ccur;
1443 }
1444}
1445#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1446
1447/*
1448 * Helper function for rcu_init() that initializes one rcu_state structure.
1449 */
1450static void __init rcu_init_one(struct rcu_state *rsp)
1451{
1452 int cpustride = 1;
1453 int i;
1454 int j;
1455 struct rcu_node *rnp;
1456
1457 /* Initialize the level-tracking arrays. */
1458
1459 for (i = 1; i < NUM_RCU_LVLS; i++)
1460 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1461 rcu_init_levelspread(rsp);
1462
1463 /* Initialize the elements themselves, starting from the leaves. */
1464
1465 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1466 cpustride *= rsp->levelspread[i];
1467 rnp = rsp->level[i];
1468 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1469 spin_lock_init(&rnp->lock);
1470 rnp->qsmask = 0;
1471 rnp->qsmaskinit = 0;
1472 rnp->grplo = j * cpustride;
1473 rnp->grphi = (j + 1) * cpustride - 1;
1474 if (rnp->grphi >= NR_CPUS)
1475 rnp->grphi = NR_CPUS - 1;
1476 if (i == 0) {
1477 rnp->grpnum = 0;
1478 rnp->grpmask = 0;
1479 rnp->parent = NULL;
1480 } else {
1481 rnp->grpnum = j % rsp->levelspread[i - 1];
1482 rnp->grpmask = 1UL << rnp->grpnum;
1483 rnp->parent = rsp->level[i - 1] +
1484 j / rsp->levelspread[i - 1];
1485 }
1486 rnp->level = i;
1487 }
1488 }
1489}
1490
1491/*
1492 * Helper macro for __rcu_init(). To be used nowhere else!
1493 * Assigns leaf node pointers into each CPU's rcu_data structure.
1494 */
1495#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1496do { \
1497 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1498 j = 0; \
1499 for_each_possible_cpu(i) { \
1500 if (i > rnp[j].grphi) \
1501 j++; \
1502 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1503 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1504 } \
1505} while (0)
1506
1507static struct notifier_block __cpuinitdata rcu_nb = {
1508 .notifier_call = rcu_cpu_notify,
1509};
1510
1511void __init __rcu_init(void)
1512{
1513 int i; /* All used by RCU_DATA_PTR_INIT(). */
1514 int j;
1515 struct rcu_node *rnp;
1516
1517 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1518#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1519 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1520#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1521 rcu_init_one(&rcu_state);
1522 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1523 rcu_init_one(&rcu_bh_state);
1524 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1525
1526 for_each_online_cpu(i)
1527 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1528 /* Register notifier for non-boot CPUs */
1529 register_cpu_notifier(&rcu_nb);
1530 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1531}
1532
1533module_param(blimit, int, 0);
1534module_param(qhimark, int, 0);
1535module_param(qlowmark, int, 0);
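
The balanced (non-exact) levelspread computation above is just a repeated ceiling division from the leaves up. The following standalone user-space sketch reproduces it; the levelcnt[] values and the 65-CPU, fanout-32 geometry are assumptions chosen for illustration, not taken from the patch:

#include <stdio.h>

#define NUM_RCU_LVLS	2
#define NR_CPUS		65

int main(void)
{
	int levelcnt[NUM_RCU_LVLS] = { 1, 3 };	/* assumed: one root, three leaves */
	int levelspread[NUM_RCU_LVLS];
	int cprv = NR_CPUS;
	int ccur, i;

	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
		ccur = levelcnt[i];
		levelspread[i] = (cprv + ccur - 1) / ccur;	/* ceil(cprv / ccur) */
		cprv = ccur;
	}
	for (i = 0; i < NUM_RCU_LVLS; i++)
		printf("level %d: %d node(s), spread %d\n",
		       i, levelcnt[i], levelspread[i]);
	return 0;
}

This prints "level 0: 1 node(s), spread 3" and "level 1: 3 node(s), spread 22": each of the three leaf rcu_node structures covers at most 22 CPUs, instead of the 32/32/1 split that CONFIG_RCU_FANOUT_EXACT would produce for the same geometry.
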
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for hierarchical implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"qp\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
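
rcuclassic_trace_init() above unwinds a partial failure by removing each file that was created before removing the directory. Since debugfs_remove_recursive() is already available in this kernel series, the same unwinding could be expressed more compactly; a minimal alternative sketch (not what the patch does, reusing the fops structures defined above) might look like this:

#include <linux/debugfs.h>

static int __init rcu_trace_init_alt(void)
{
	/* The directory dentry would still need to be saved for module exit. */
	struct dentry *dir = debugfs_create_dir("rcu", NULL);

	if (!dir)
		return 1;
	if (!debugfs_create_file("rcudata", 0444, dir, NULL, &rcudata_fops) ||
	    !debugfs_create_file("rcudata.csv", 0444, dir, NULL,
				 &rcudata_csv_fops) ||
	    !debugfs_create_file("rcugp", 0444, dir, NULL, &rcugp_fops) ||
	    !debugfs_create_file("rcuhier", 0444, dir, NULL, &rcuhier_fops)) {
		debugfs_remove_recursive(dir);	/* tears down whatever was created */
		return 1;
	}
	return 0;
}
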
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..e633106b12f6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 855 continue;
856 /*
857 * if a resource is "BUSY", it's not a hardware resource
858 * but a driver mapping of such a resource; we don't want
859 * to warn for those; some drivers legitimately map only
860 * partial hardware resources. (example: vesafb)
861 */
862 if (p->flags & IORESOURCE_BUSY)
863 continue;
864
856 printk(KERN_WARNING "resource map sanity check conflict: " 865 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 866 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 867 (unsigned long long)addr,
diff --git a/kernel/sched.c b/kernel/sched.c
index 756d981d91a4..27ba1d642f0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -209,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
209 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
210 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
211 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
212 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
213} 212}
214 213
215static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -361,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
361 struct task_group *tg; 360 struct task_group *tg;
362 361
363#ifdef CONFIG_USER_SCHED 362#ifdef CONFIG_USER_SCHED
364 tg = p->user->tg; 363 rcu_read_lock();
364 tg = __task_cred(p)->user->tg;
365 rcu_read_unlock();
365#elif defined(CONFIG_CGROUP_SCHED) 366#elif defined(CONFIG_CGROUP_SCHED)
366 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 367 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
367 struct task_group, css); 368 struct task_group, css);
@@ -610,6 +611,8 @@ struct rq {
610#ifdef CONFIG_SCHEDSTATS 611#ifdef CONFIG_SCHEDSTATS
611 /* latency stats */ 612 /* latency stats */
612 struct sched_info rq_sched_info; 613 struct sched_info rq_sched_info;
614 unsigned long long rq_cpu_time;
615 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
613 616
614 /* sys_sched_yield() stats */ 617 /* sys_sched_yield() stats */
615 unsigned int yld_exp_empty; 618 unsigned int yld_exp_empty;
@@ -1143,7 +1146,6 @@ static void init_rq_hrtick(struct rq *rq)
1143 1146
1144 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1147 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1145 rq->hrtick_timer.function = hrtick; 1148 rq->hrtick_timer.function = hrtick;
1146 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1147} 1149}
1148#else /* CONFIG_SCHED_HRTICK */ 1150#else /* CONFIG_SCHED_HRTICK */
1149static inline void hrtick_clear(struct rq *rq) 1151static inline void hrtick_clear(struct rq *rq)
@@ -1871,6 +1873,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1871 1873
1872 clock_offset = old_rq->clock - new_rq->clock; 1874 clock_offset = old_rq->clock - new_rq->clock;
1873 1875
1876 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1877
1874#ifdef CONFIG_SCHEDSTATS 1878#ifdef CONFIG_SCHEDSTATS
1875 if (p->se.wait_start) 1879 if (p->se.wait_start)
1876 p->se.wait_start -= clock_offset; 1880 p->se.wait_start -= clock_offset;
@@ -2277,6 +2281,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2277 2281
2278 smp_wmb(); 2282 smp_wmb();
2279 rq = task_rq_lock(p, &flags); 2283 rq = task_rq_lock(p, &flags);
2284 update_rq_clock(rq);
2280 old_state = p->state; 2285 old_state = p->state;
2281 if (!(old_state & state)) 2286 if (!(old_state & state))
2282 goto out; 2287 goto out;
@@ -2334,12 +2339,11 @@ out_activate:
2334 schedstat_inc(p, se.nr_wakeups_local); 2339 schedstat_inc(p, se.nr_wakeups_local);
2335 else 2340 else
2336 schedstat_inc(p, se.nr_wakeups_remote); 2341 schedstat_inc(p, se.nr_wakeups_remote);
2337 update_rq_clock(rq);
2338 activate_task(rq, p, 1); 2342 activate_task(rq, p, 1);
2339 success = 1; 2343 success = 1;
2340 2344
2341out_running: 2345out_running:
2342 trace_sched_wakeup(rq, p); 2346 trace_sched_wakeup(rq, p, success);
2343 check_preempt_curr(rq, p, sync); 2347 check_preempt_curr(rq, p, sync);
2344 2348
2345 p->state = TASK_RUNNING; 2349 p->state = TASK_RUNNING;
@@ -2472,7 +2476,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2472 p->sched_class->task_new(rq, p); 2476 p->sched_class->task_new(rq, p);
2473 inc_nr_running(rq); 2477 inc_nr_running(rq);
2474 } 2478 }
2475 trace_sched_wakeup_new(rq, p); 2479 trace_sched_wakeup_new(rq, p, 1);
2476 check_preempt_curr(rq, p, 0); 2480 check_preempt_curr(rq, p, 0);
2477#ifdef CONFIG_SMP 2481#ifdef CONFIG_SMP
2478 if (p->sched_class->task_wake_up) 2482 if (p->sched_class->task_wake_up)
@@ -2851,7 +2855,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2851 || unlikely(!cpu_active(dest_cpu))) 2855 || unlikely(!cpu_active(dest_cpu)))
2852 goto out; 2856 goto out;
2853 2857
2854 trace_sched_migrate_task(rq, p, dest_cpu);
2855 /* force the process onto the specified CPU */ 2858 /* force the process onto the specified CPU */
2856 if (migrate_task(p, dest_cpu, &req)) { 2859 if (migrate_task(p, dest_cpu, &req)) {
2857 /* Need to wait for migration thread (might exit: take ref). */ 2860 /* Need to wait for migration thread (might exit: take ref). */
@@ -5187,6 +5190,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5187 set_load_weight(p); 5190 set_load_weight(p);
5188} 5191}
5189 5192
5193/*
5194 * check the target process has a UID that matches the current process's
5195 */
5196static bool check_same_owner(struct task_struct *p)
5197{
5198 const struct cred *cred = current_cred(), *pcred;
5199 bool match;
5200
5201 rcu_read_lock();
5202 pcred = __task_cred(p);
5203 match = (cred->euid == pcred->euid ||
5204 cred->euid == pcred->uid);
5205 rcu_read_unlock();
5206 return match;
5207}
5208
5190static int __sched_setscheduler(struct task_struct *p, int policy, 5209static int __sched_setscheduler(struct task_struct *p, int policy,
5191 struct sched_param *param, bool user) 5210 struct sched_param *param, bool user)
5192{ 5211{
@@ -5246,8 +5265,7 @@ recheck:
5246 return -EPERM; 5265 return -EPERM;
5247 5266
5248 /* can't change other user's priorities */ 5267 /* can't change other user's priorities */
5249 if ((current->euid != p->euid) && 5268 if (!check_same_owner(p))
5250 (current->euid != p->uid))
5251 return -EPERM; 5269 return -EPERM;
5252 } 5270 }
5253 5271
@@ -5486,8 +5504,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5486 goto out_free_cpus_allowed; 5504 goto out_free_cpus_allowed;
5487 } 5505 }
5488 retval = -EPERM; 5506 retval = -EPERM;
5489 if ((current->euid != p->euid) && (current->euid != p->uid) && 5507 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5490 !capable(CAP_SYS_NICE))
5491 goto out_unlock; 5508 goto out_unlock;
5492 5509
5493 retval = security_task_setscheduler(p, 0, NULL); 5510 retval = security_task_setscheduler(p, 0, NULL);
@@ -9423,6 +9440,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9423 kfree(ca); 9440 kfree(ca);
9424} 9441}
9425 9442
9443static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9444{
9445 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9446 u64 data;
9447
9448#ifndef CONFIG_64BIT
9449 /*
9450 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9451 */
9452 spin_lock_irq(&cpu_rq(cpu)->lock);
9453 data = *cpuusage;
9454 spin_unlock_irq(&cpu_rq(cpu)->lock);
9455#else
9456 data = *cpuusage;
9457#endif
9458
9459 return data;
9460}
9461
9462static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9463{
9464 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9465
9466#ifndef CONFIG_64BIT
9467 /*
9468 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9469 */
9470 spin_lock_irq(&cpu_rq(cpu)->lock);
9471 *cpuusage = val;
9472 spin_unlock_irq(&cpu_rq(cpu)->lock);
9473#else
9474 *cpuusage = val;
9475#endif
9476}
9477
9426/* return total cpu usage (in nanoseconds) of a group */ 9478/* return total cpu usage (in nanoseconds) of a group */
9427static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9479static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9428{ 9480{
@@ -9430,17 +9482,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9430 u64 totalcpuusage = 0; 9482 u64 totalcpuusage = 0;
9431 int i; 9483 int i;
9432 9484
9433 for_each_possible_cpu(i) { 9485 for_each_present_cpu(i)
9434 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9486 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9435
9436 /*
9437 * Take rq->lock to make 64-bit addition safe on 32-bit
9438 * platforms.
9439 */
9440 spin_lock_irq(&cpu_rq(i)->lock);
9441 totalcpuusage += *cpuusage;
9442 spin_unlock_irq(&cpu_rq(i)->lock);
9443 }
9444 9487
9445 return totalcpuusage; 9488 return totalcpuusage;
9446} 9489}
@@ -9457,23 +9500,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9457 goto out; 9500 goto out;
9458 } 9501 }
9459 9502
9460 for_each_possible_cpu(i) { 9503 for_each_present_cpu(i)
9461 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9504 cpuacct_cpuusage_write(ca, i, 0);
9462 9505
9463 spin_lock_irq(&cpu_rq(i)->lock);
9464 *cpuusage = 0;
9465 spin_unlock_irq(&cpu_rq(i)->lock);
9466 }
9467out: 9506out:
9468 return err; 9507 return err;
9469} 9508}
9470 9509
9510static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9511 struct seq_file *m)
9512{
9513 struct cpuacct *ca = cgroup_ca(cgroup);
9514 u64 percpu;
9515 int i;
9516
9517 for_each_present_cpu(i) {
9518 percpu = cpuacct_cpuusage_read(ca, i);
9519 seq_printf(m, "%llu ", (unsigned long long) percpu);
9520 }
9521 seq_printf(m, "\n");
9522 return 0;
9523}
9524
9471static struct cftype files[] = { 9525static struct cftype files[] = {
9472 { 9526 {
9473 .name = "usage", 9527 .name = "usage",
9474 .read_u64 = cpuusage_read, 9528 .read_u64 = cpuusage_read,
9475 .write_u64 = cpuusage_write, 9529 .write_u64 = cpuusage_write,
9476 }, 9530 },
9531 {
9532 .name = "usage_percpu",
9533 .read_seq_string = cpuacct_percpu_seq_read,
9534 },
9535
9477}; 9536};
9478 9537
9479static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9538static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
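
The new cpuacct_cpuusage_read()/cpuacct_cpuusage_write() helpers exist because a u64 load or store is not atomic on 32-bit machines: without the runqueue lock, a reader could observe half of an in-progress update. The same pattern in isolation, as a minimal sketch with a hypothetical counter type and a plain spinlock (the patch uses rq->lock with IRQs disabled because that lock is also taken from interrupt context):

#include <linux/spinlock.h>
#include <linux/types.h>

struct counter64 {
	spinlock_t lock;	/* held by all writers */
	u64 value;
};

static u64 counter64_read(struct counter64 *c)
{
	u64 v;

#ifndef CONFIG_64BIT
	/* 32-bit: a u64 access is two machine words, serialize against writers. */
	spin_lock(&c->lock);
	v = c->value;
	spin_unlock(&c->lock);
#else
	/* 64-bit: a single aligned load cannot be torn. */
	v = c->value;
#endif
	return v;
}
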
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 36b5e34fa99e..56c0efe902a7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
492 * overflow on 32 bits): 492 * overflow on 32 bits):
493 */ 493 */
494 delta_exec = (unsigned long)(now - curr->exec_start); 494 delta_exec = (unsigned long)(now - curr->exec_start);
495 if (!delta_exec)
496 return;
495 497
496 __update_curr(cfs_rq, curr, delta_exec); 498 __update_curr(cfs_rq, curr, delta_exec);
497 curr->exec_start = now; 499 curr->exec_start = now;
@@ -1361,12 +1363,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1361{ 1363{
1362 struct task_struct *curr = rq->curr; 1364 struct task_struct *curr = rq->curr;
1363 struct sched_entity *se = &curr->se, *pse = &p->se; 1365 struct sched_entity *se = &curr->se, *pse = &p->se;
1366 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1364 1367
1365 if (unlikely(rt_prio(p->prio))) { 1368 update_curr(cfs_rq);
1366 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1367 1369
1368 update_rq_clock(rq); 1370 if (unlikely(rt_prio(p->prio))) {
1369 update_curr(cfs_rq);
1370 resched_task(curr); 1371 resched_task(curr);
1371 return; 1372 return;
1372 } 1373 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1bbd99014011..833b6d44483c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
77} 77}
78 78
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 79#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 81
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{ 83{
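
The one-line sched_rt.c change converts the leaf rt_rq walk to list_for_each_entry_rcu(), which pairs readers traversing the list with updaters that modify it through the RCU list primitives. As a reminder of that contract, a minimal sketch with hypothetical names (updaters would use list_add_rcu()/list_del_rcu() and free entries only after a grace period):

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct item {
	int val;
	struct list_head node;
};

static LIST_HEAD(item_list);	/* modified with list_add_rcu()/list_del_rcu() */

static int item_sum(void)
{
	struct item *it;
	int sum = 0;

	rcu_read_lock();	/* read-side critical section: no blocking inside */
	list_for_each_entry_rcu(it, &item_list, node)
		sum += it->val;
	rcu_read_unlock();
	return sum;
}
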
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5fcf0e184586..f2773b5d1226 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, 31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 32 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 33 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_sched_info.cpu_time, 34 rq->rq_cpu_time,
35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
36 36
37 seq_printf(seq, "\n"); 37 seq_printf(seq, "\n");
@@ -124,7 +124,7 @@ static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta) 124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{ 125{
126 if (rq) 126 if (rq)
127 rq->rq_sched_info.cpu_time += delta; 127 rq->rq_cpu_time += delta;
128} 128}
129 129
130static inline void 130static inline void
@@ -237,7 +237,6 @@ static inline void sched_info_depart(struct task_struct *t)
237 unsigned long long delta = task_rq(t)->clock - 237 unsigned long long delta = task_rq(t)->clock -
238 t->sched_info.last_arrival; 238 t->sched_info.last_arrival;
239 239
240 t->sched_info.cpu_time += delta;
241 rq_sched_info_depart(task_rq(t), delta); 240 rq_sched_info_depart(task_rq(t), delta);
242 241
243 if (t->state == TASK_RUNNING) 242 if (t->state == TASK_RUNNING)
diff --git a/kernel/signal.c b/kernel/signal.c
index e9afe63da24b..8e95855ff3cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -179,6 +179,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
179 return sig; 179 return sig;
180} 180}
181 181
182/*
183 * allocate a new signal queue record
184 * - this may be called without locks if and only if t == current, otherwise an
185 * appropriate lock must be held to stop the target task from exiting
186 */
182static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 187static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
183 int override_rlimit) 188 int override_rlimit)
184{ 189{
@@ -186,11 +191,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
186 struct user_struct *user; 191 struct user_struct *user;
187 192
188 /* 193 /*
189 * In order to avoid problems with "switch_user()", we want to make 194 * We won't get problems with the target's UID changing under us
190 * sure that the compiler doesn't re-load "t->user" 195 * because changing it requires RCU be used, and if t != current, the
196 * caller must be holding the RCU readlock (by way of a spinlock) and
197 * we use RCU protection here
191 */ 198 */
192 user = t->user; 199 user = get_uid(__task_cred(t)->user);
193 barrier();
194 atomic_inc(&user->sigpending); 200 atomic_inc(&user->sigpending);
195 if (override_rlimit || 201 if (override_rlimit ||
196 atomic_read(&user->sigpending) <= 202 atomic_read(&user->sigpending) <=
@@ -198,12 +204,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
198 q = kmem_cache_alloc(sigqueue_cachep, flags); 204 q = kmem_cache_alloc(sigqueue_cachep, flags);
199 if (unlikely(q == NULL)) { 205 if (unlikely(q == NULL)) {
200 atomic_dec(&user->sigpending); 206 atomic_dec(&user->sigpending);
207 free_uid(user);
201 } else { 208 } else {
202 INIT_LIST_HEAD(&q->list); 209 INIT_LIST_HEAD(&q->list);
203 q->flags = 0; 210 q->flags = 0;
204 q->user = get_uid(user); 211 q->user = user;
205 } 212 }
206 return(q); 213
214 return q;
207} 215}
208 216
209static void __sigqueue_free(struct sigqueue *q) 217static void __sigqueue_free(struct sigqueue *q)
@@ -564,10 +572,12 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
564 572
565/* 573/*
566 * Bad permissions for sending the signal 574 * Bad permissions for sending the signal
575 * - the caller must hold at least the RCU read lock
567 */ 576 */
568static int check_kill_permission(int sig, struct siginfo *info, 577static int check_kill_permission(int sig, struct siginfo *info,
569 struct task_struct *t) 578 struct task_struct *t)
570{ 579{
580 const struct cred *cred = current_cred(), *tcred;
571 struct pid *sid; 581 struct pid *sid;
572 int error; 582 int error;
573 583
@@ -581,8 +591,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
581 if (error) 591 if (error)
582 return error; 592 return error;
583 593
584 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && 594 tcred = __task_cred(t);
585 (current->uid ^ t->suid) && (current->uid ^ t->uid) && 595 if ((cred->euid ^ tcred->suid) &&
596 (cred->euid ^ tcred->uid) &&
597 (cred->uid ^ tcred->suid) &&
598 (cred->uid ^ tcred->uid) &&
586 !capable(CAP_KILL)) { 599 !capable(CAP_KILL)) {
587 switch (sig) { 600 switch (sig) {
588 case SIGCONT: 601 case SIGCONT:
@@ -846,7 +859,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
846 q->info.si_errno = 0; 859 q->info.si_errno = 0;
847 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
848 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_pid_vnr(current);
849 q->info.si_uid = current->uid; 862 q->info.si_uid = current_uid();
850 break; 863 break;
851 case (unsigned long) SEND_SIG_PRIV: 864 case (unsigned long) SEND_SIG_PRIV:
852 q->info.si_signo = sig; 865 q->info.si_signo = sig;
@@ -1010,6 +1023,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1010 return sighand; 1023 return sighand;
1011} 1024}
1012 1025
1026/*
1027 * send signal info to all the members of a group
1028 * - the caller must hold the RCU read lock at least
1029 */
1013int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1030int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1014{ 1031{
1015 unsigned long flags; 1032 unsigned long flags;
@@ -1031,8 +1048,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1031/* 1048/*
1032 * __kill_pgrp_info() sends a signal to a process group: this is what the tty 1049 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1033 * control characters do (^C, ^Z etc) 1050 * control characters do (^C, ^Z etc)
1051 * - the caller must hold at least a readlock on tasklist_lock
1034 */ 1052 */
1035
1036int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) 1053int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1037{ 1054{
1038 struct task_struct *p = NULL; 1055 struct task_struct *p = NULL;
@@ -1088,6 +1105,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1088{ 1105{
1089 int ret = -EINVAL; 1106 int ret = -EINVAL;
1090 struct task_struct *p; 1107 struct task_struct *p;
1108 const struct cred *pcred;
1091 1109
1092 if (!valid_signal(sig)) 1110 if (!valid_signal(sig))
1093 return ret; 1111 return ret;
@@ -1098,9 +1116,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1098 ret = -ESRCH; 1116 ret = -ESRCH;
1099 goto out_unlock; 1117 goto out_unlock;
1100 } 1118 }
1101 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 1119 pcred = __task_cred(p);
1102 && (euid != p->suid) && (euid != p->uid) 1120 if ((info == SEND_SIG_NOINFO ||
1103 && (uid != p->suid) && (uid != p->uid)) { 1121 (!is_si_special(info) && SI_FROMUSER(info))) &&
1122 euid != pcred->suid && euid != pcred->uid &&
1123 uid != pcred->suid && uid != pcred->uid) {
1104 ret = -EPERM; 1124 ret = -EPERM;
1105 goto out_unlock; 1125 goto out_unlock;
1106 } 1126 }
@@ -1371,10 +1391,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1371 */ 1391 */
1372 rcu_read_lock(); 1392 rcu_read_lock();
1373 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1393 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1394 info.si_uid = __task_cred(tsk)->uid;
1374 rcu_read_unlock(); 1395 rcu_read_unlock();
1375 1396
1376 info.si_uid = tsk->uid;
1377
1378 thread_group_cputime(tsk, &cputime); 1397 thread_group_cputime(tsk, &cputime);
1379 info.si_utime = cputime_to_jiffies(cputime.utime); 1398 info.si_utime = cputime_to_jiffies(cputime.utime);
1380 info.si_stime = cputime_to_jiffies(cputime.stime); 1399 info.si_stime = cputime_to_jiffies(cputime.stime);
@@ -1442,10 +1461,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1442 */ 1461 */
1443 rcu_read_lock(); 1462 rcu_read_lock();
1444 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1463 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1464 info.si_uid = __task_cred(tsk)->uid;
1445 rcu_read_unlock(); 1465 rcu_read_unlock();
1446 1466
1447 info.si_uid = tsk->uid;
1448
1449 info.si_utime = cputime_to_clock_t(tsk->utime); 1467 info.si_utime = cputime_to_clock_t(tsk->utime);
1450 info.si_stime = cputime_to_clock_t(tsk->stime); 1468 info.si_stime = cputime_to_clock_t(tsk->stime);
1451 1469
@@ -1600,7 +1618,7 @@ void ptrace_notify(int exit_code)
1600 info.si_signo = SIGTRAP; 1618 info.si_signo = SIGTRAP;
1601 info.si_code = exit_code; 1619 info.si_code = exit_code;
1602 info.si_pid = task_pid_vnr(current); 1620 info.si_pid = task_pid_vnr(current);
1603 info.si_uid = current->uid; 1621 info.si_uid = current_uid();
1604 1622
1605 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1606 spin_lock_irq(&current->sighand->siglock); 1624 spin_lock_irq(&current->sighand->siglock);
@@ -1712,7 +1730,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1712 info->si_errno = 0; 1730 info->si_errno = 0;
1713 info->si_code = SI_USER; 1731 info->si_code = SI_USER;
1714 info->si_pid = task_pid_vnr(current->parent); 1732 info->si_pid = task_pid_vnr(current->parent);
1715 info->si_uid = current->parent->uid; 1733 info->si_uid = task_uid(current->parent);
1716 } 1734 }
1717 1735
1718 /* If the (new) signal is now blocked, requeue it. */ 1736 /* If the (new) signal is now blocked, requeue it. */
@@ -2213,7 +2231,7 @@ sys_kill(pid_t pid, int sig)
2213 info.si_errno = 0; 2231 info.si_errno = 0;
2214 info.si_code = SI_USER; 2232 info.si_code = SI_USER;
2215 info.si_pid = task_tgid_vnr(current); 2233 info.si_pid = task_tgid_vnr(current);
2216 info.si_uid = current->uid; 2234 info.si_uid = current_uid();
2217 2235
2218 return kill_something_info(sig, &info, pid); 2236 return kill_something_info(sig, &info, pid);
2219} 2237}
@@ -2230,7 +2248,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2230 info.si_errno = 0; 2248 info.si_errno = 0;
2231 info.si_code = SI_TKILL; 2249 info.si_code = SI_TKILL;
2232 info.si_pid = task_tgid_vnr(current); 2250 info.si_pid = task_tgid_vnr(current);
2233 info.si_uid = current->uid; 2251 info.si_uid = current_uid();
2234 2252
2235 rcu_read_lock(); 2253 rcu_read_lock();
2236 p = find_task_by_vpid(pid); 2254 p = find_task_by_vpid(pid);
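
Throughout the signal.c hunks, direct reads of t->uid/t->euid/t->suid are replaced by dereferences of the task's credential structure, which is only stable while the RCU read lock (or a lock that implies it) is held. The basic access pattern, as a minimal sketch with a hypothetical helper built on the cred API introduced by this merge (__task_cred(), current_cred()):

#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: does @p currently run with effective UID @euid? */
static bool task_has_euid(struct task_struct *p, uid_t euid)
{
	const struct cred *tcred;
	bool match;

	rcu_read_lock();		/* keeps the cred struct from being freed */
	tcred = __task_cred(p);		/* must not be used after rcu_read_unlock() */
	match = (tcred->euid == euid);
	rcu_read_unlock();
	return match;
}
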
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c55..5cfa0e5e3e88 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
24 struct call_single_data csd; 24 struct call_single_data csd;
25 spinlock_t lock; 25 spinlock_t lock;
26 unsigned int refs; 26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head; 27 struct rcu_head rcu_head;
28 unsigned long cpumask_bits[];
29}; 29};
30 30
31struct call_single_queue { 31struct call_single_queue {
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) { 110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 111 int refs;
112 112
113 if (!cpu_isset(cpu, data->cpumask)) 113 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
114 continue; 114 continue;
115 115
116 data->csd.func(data->csd.info); 116 data->csd.func(data->csd.info);
117 117
118 spin_lock(&data->lock); 118 spin_lock(&data->lock);
119 cpu_clear(cpu, data->cpumask); 119 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 120 WARN_ON(data->refs == 0);
121 data->refs--; 121 data->refs--;
122 refs = data->refs; 122 refs = data->refs;
@@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
223 local_irq_save(flags); 223 local_irq_save(flags);
224 func(info); 224 func(info);
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { 226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 227 struct call_single_data *data = NULL;
228 228
229 if (!wait) { 229 if (!wait) {
@@ -266,51 +266,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
266 generic_exec_single(cpu, data); 266 generic_exec_single(cpu, data);
267} 267}
268 268
269/* Dummy function */ 269/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
270static void quiesce_dummy(void *unused) 270#ifndef arch_send_call_function_ipi_mask
271{ 271#define arch_send_call_function_ipi_mask(maskp) \
272} 272 arch_send_call_function_ipi(*(maskp))
273 273#endif
274/*
275 * Ensure stack based data used in call function mask is safe to free.
276 *
277 * This is needed by smp_call_function_mask when using on-stack data, because
278 * a single call function queue is shared by all CPUs, and any CPU may pick up
279 * the data item on the queue at any time before it is deleted. So we need to
280 * ensure that all CPUs have transitioned through a quiescent state after
281 * this call.
282 *
283 * This is a very slow function, implemented by sending synchronous IPIs to
284 * all possible CPUs. For this reason, we have to alloc data rather than use
285 * stack based data even in the case of synchronous calls. The stack based
286 * data is then just used for deadlock/oom fallback which will be very rare.
287 *
288 * If a faster scheme can be made, we could go back to preferring stack based
289 * data -- the data allocation/free is non-zero cost.
290 */
291static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
292{
293 struct call_single_data data;
294 int cpu;
295
296 data.func = quiesce_dummy;
297 data.info = NULL;
298
299 for_each_cpu_mask(cpu, mask) {
300 data.flags = CSD_FLAG_WAIT;
301 generic_exec_single(cpu, &data);
302 }
303}
304 274
305/** 275/**
306 * smp_call_function_mask(): Run a function on a set of other CPUs. 276 * smp_call_function_many(): Run a function on a set of other CPUs.
307 * @mask: The set of cpus to run on. 277 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 278 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 279 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 280 * @wait: If true, wait (atomically) until function has completed on other CPUs.
311 * 281 *
312 * Returns 0 on success, else a negative status code.
313 *
314 * If @wait is true, then returns once @func has returned. Note that @wait 282 * If @wait is true, then returns once @func has returned. Note that @wait
315 * will be implicitly turned on in case of allocation failures, since 283 * will be implicitly turned on in case of allocation failures, since
316 * we fall back to on-stack allocation. 284 * we fall back to on-stack allocation.
@@ -319,53 +287,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
319 * hardware interrupt handler or from a bottom half handler. Preemption 287 * hardware interrupt handler or from a bottom half handler. Preemption
320 * must be disabled when calling this function. 288 * must be disabled when calling this function.
321 */ 289 */
322int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, 290void smp_call_function_many(const struct cpumask *mask,
323 int wait) 291 void (*func)(void *), void *info,
292 bool wait)
324{ 293{
325 struct call_function_data d; 294 struct call_function_data *data;
326 struct call_function_data *data = NULL;
327 cpumask_t allbutself;
328 unsigned long flags; 295 unsigned long flags;
329 int cpu, num_cpus; 296 int cpu, next_cpu;
330 int slowpath = 0;
331 297
332 /* Can deadlock when called with interrupts disabled */ 298 /* Can deadlock when called with interrupts disabled */
333 WARN_ON(irqs_disabled()); 299 WARN_ON(irqs_disabled());
334 300
335 cpu = smp_processor_id(); 301 /* So, what's a CPU they want? Ignoring this one. */
336 allbutself = cpu_online_map; 302 cpu = cpumask_first_and(mask, cpu_online_mask);
337 cpu_clear(cpu, allbutself); 303 if (cpu == smp_processor_id())
338 cpus_and(mask, mask, allbutself); 304 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
339 num_cpus = cpus_weight(mask); 305 /* No online cpus? We're done. */
340 306 if (cpu >= nr_cpu_ids)
341 /* 307 return;
342 * If zero CPUs, return. If just a single CPU, turn this request 308
343 * into a targetted single call instead since it's faster. 309 /* Do we have another CPU which isn't us? */
344 */ 310 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
345 if (!num_cpus) 311 if (next_cpu == smp_processor_id())
346 return 0; 312 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
347 else if (num_cpus == 1) { 313
348 cpu = first_cpu(mask); 314 /* Fastpath: do that cpu by itself. */
349 return smp_call_function_single(cpu, func, info, wait); 315 if (next_cpu >= nr_cpu_ids) {
316 smp_call_function_single(cpu, func, info, wait);
317 return;
350 } 318 }
351 319
352 data = kmalloc(sizeof(*data), GFP_ATOMIC); 320 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
353 if (data) { 321 if (unlikely(!data)) {
354 data->csd.flags = CSD_FLAG_ALLOC; 322 /* Slow path. */
355 if (wait) 323 for_each_online_cpu(cpu) {
356 data->csd.flags |= CSD_FLAG_WAIT; 324 if (cpu == smp_processor_id())
357 } else { 325 continue;
358 data = &d; 326 if (cpumask_test_cpu(cpu, mask))
359 data->csd.flags = CSD_FLAG_WAIT; 327 smp_call_function_single(cpu, func, info, wait);
360 wait = 1; 328 }
361 slowpath = 1; 329 return;
362 } 330 }
363 331
364 spin_lock_init(&data->lock); 332 spin_lock_init(&data->lock);
333 data->csd.flags = CSD_FLAG_ALLOC;
334 if (wait)
335 data->csd.flags |= CSD_FLAG_WAIT;
365 data->csd.func = func; 336 data->csd.func = func;
366 data->csd.info = info; 337 data->csd.info = info;
367 data->refs = num_cpus; 338 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
368 data->cpumask = mask; 339 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
340 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
369 341
370 spin_lock_irqsave(&call_function_lock, flags); 342 spin_lock_irqsave(&call_function_lock, flags);
371 list_add_tail_rcu(&data->csd.list, &call_function_queue); 343 list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +349,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
377 smp_mb(); 349 smp_mb();
378 350
379 /* Send a message to all CPUs in the map */ 351 /* Send a message to all CPUs in the map */
380 arch_send_call_function_ipi(mask); 352 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
381 353
382 /* optionally wait for the CPUs to complete */ 354 /* optionally wait for the CPUs to complete */
383 if (wait) { 355 if (wait)
384 csd_flag_wait(&data->csd); 356 csd_flag_wait(&data->csd);
385 if (unlikely(slowpath))
386 smp_call_function_mask_quiesce_stack(mask);
387 }
388
389 return 0;
390} 357}
391EXPORT_SYMBOL(smp_call_function_mask); 358EXPORT_SYMBOL(smp_call_function_many);
392 359
393/** 360/**
394 * smp_call_function(): Run a function on all other CPUs. 361 * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
396 * @info: An arbitrary pointer to pass to the function. 363 * @info: An arbitrary pointer to pass to the function.
397 * @wait: If true, wait (atomically) until function has completed on other CPUs. 364 * @wait: If true, wait (atomically) until function has completed on other CPUs.
398 * 365 *
399 * Returns 0 on success, else a negative status code. 366 * Returns 0.
400 * 367 *
401 * If @wait is true, then returns once @func has returned; otherwise 368 * If @wait is true, then returns once @func has returned; otherwise
402 * it returns just before the target cpu calls @func. In case of allocation 369 * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +374,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
407 */ 374 */
408int smp_call_function(void (*func)(void *), void *info, int wait) 375int smp_call_function(void (*func)(void *), void *info, int wait)
409{ 376{
410 int ret;
411
412 preempt_disable(); 377 preempt_disable();
413 ret = smp_call_function_mask(cpu_online_map, func, info, wait); 378 smp_call_function_many(cpu_online_mask, func, info, wait);
414 preempt_enable(); 379 preempt_enable();
415 return ret; 380 return 0;
416} 381}
417EXPORT_SYMBOL(smp_call_function); 382EXPORT_SYMBOL(smp_call_function);
418 383
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..b7568d7def23 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
@@ -746,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
746 break; 733 break;
747 /* Unbind so it can run. Fall thru. */ 734 /* Unbind so it can run. Fall thru. */
748 kthread_bind(per_cpu(ksoftirqd, hotcpu), 735 kthread_bind(per_cpu(ksoftirqd, hotcpu),
749 any_online_cpu(cpu_online_map)); 736 cpumask_any(cpu_online_mask));
750 case CPU_DEAD: 737 case CPU_DEAD:
751 case CPU_DEAD_FROZEN: { 738 case CPU_DEAD_FROZEN: {
752 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 1ab790c67b17..d9188c66278a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -303,17 +303,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 303 break;
304 case CPU_ONLINE: 304 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 305 case CPU_ONLINE_FROZEN:
306 check_cpu = any_online_cpu(cpu_online_map); 306 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 307 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 308 break;
309#ifdef CONFIG_HOTPLUG_CPU 309#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE: 310 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN: 311 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) { 312 if (hotcpu == check_cpu) {
313 cpumask_t temp_cpu_online_map = cpu_online_map; 313 /* Pick any other online cpu. */
314 314 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 cpu_clear(hotcpu, temp_cpu_online_map);
316 check_cpu = any_online_cpu(temp_cpu_online_map);
317 } 315 }
318 break; 316 break;
319 317
@@ -323,7 +321,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
323 break; 321 break;
324 /* Unbind so it can run. Fall thru. */ 322 /* Unbind so it can run. Fall thru. */
325 kthread_bind(per_cpu(watchdog_task, hotcpu), 323 kthread_bind(per_cpu(watchdog_task, hotcpu),
326 any_online_cpu(cpu_online_map)); 324 cpumask_any(cpu_online_mask));
327 case CPU_DEAD: 325 case CPU_DEAD:
328 case CPU_DEAD_FROZEN: 326 case CPU_DEAD_FROZEN:
329 p = per_cpu(watchdog_task, hotcpu); 327 p = per_cpu(watchdog_task, hotcpu);
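
This conversion avoids copying a whole cpumask_t onto the stack just to exclude one CPU, which matters once NR_CPUS is large and masks may live off-stack. Old versus new idiom, sketched for illustration:

    /* old: copy the map, clear one bit, then search the copy */
    cpumask_t tmp = cpu_online_map;
    cpu_clear(hotcpu, tmp);
    check_cpu = any_online_cpu(tmp);

    /* new: no copy needed */
    check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
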
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
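
The __weak definition is only used when an architecture does not provide its own save_stack_trace_tsk(). A hypothetical arch-side strong definition that would override the stub looks roughly like this (sketch only; the frame walk is architecture specific):

    void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
    {
            /* arch-specific frame walk goes here; each discovered return
             * address is stored as:
             *      trace->entries[trace->nr_entries++] = addr;
             * respecting trace->max_entries and trace->skip.
             */
    }
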
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc388..286c41722e8c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused)
69 int err; 69 int err;
70 70
71 if (!active_cpus) { 71 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map)) 72 if (cpu == cpumask_first(cpu_online_mask))
73 smdata = &active; 73 smdata = &active;
74 } else { 74 } else {
75 if (cpu_isset(cpu, *active_cpus)) 75 if (cpumask_test_cpu(cpu, active_cpus))
76 smdata = &active; 76 smdata = &active;
77 } 77 }
78 /* Simple state machine */ 78 /* Simple state machine */
@@ -109,7 +109,7 @@ static int chill(void *unused)
109 return 0; 109 return 0;
110} 110}
111 111
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 112int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
113{ 113{
114 struct work_struct *sm_work; 114 struct work_struct *sm_work;
115 int i, ret; 115 int i, ret;
@@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
142 return ret; 142 return ret;
143} 143}
144 144
145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 145int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
146{ 146{
147 int ret; 147 int ret;
148 148
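
Callers now pass a const struct cpumask * (typically a constant mask such as cpumask_of() or cpu_online_mask) instead of a cpumask_t pointer. A hedged sketch of a caller, with do_reconfig as a hypothetical callback:

    static int do_reconfig(void *data)
    {
            /* runs while every other CPU spins with interrupts disabled */
            return 0;
    }

    err = stop_machine(do_reconfig, NULL, NULL);            /* fn on any one CPU */
    err = stop_machine(do_reconfig, NULL, cpumask_of(cpu)); /* fn pinned to 'cpu' */
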
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fc3a0cfb994..d356d79e84ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -112,12 +112,17 @@ EXPORT_SYMBOL(cad_pid);
112 112
113void (*pm_power_off_prepare)(void); 113void (*pm_power_off_prepare)(void);
114 114
115/*
116 * set the priority of a task
117 * - the caller must hold the RCU read lock
118 */
115static int set_one_prio(struct task_struct *p, int niceval, int error) 119static int set_one_prio(struct task_struct *p, int niceval, int error)
116{ 120{
121 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
117 int no_nice; 122 int no_nice;
118 123
119 if (p->uid != current->euid && 124 if (pcred->uid != cred->euid &&
120 p->euid != current->euid && !capable(CAP_SYS_NICE)) { 125 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
121 error = -EPERM; 126 error = -EPERM;
122 goto out; 127 goto out;
123 } 128 }
@@ -141,6 +146,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
141{ 146{
142 struct task_struct *g, *p; 147 struct task_struct *g, *p;
143 struct user_struct *user; 148 struct user_struct *user;
149 const struct cred *cred = current_cred();
144 int error = -EINVAL; 150 int error = -EINVAL;
145 struct pid *pgrp; 151 struct pid *pgrp;
146 152
@@ -174,18 +180,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 180 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 181 break;
176 case PRIO_USER: 182 case PRIO_USER:
177 user = current->user; 183 user = (struct user_struct *) cred->user;
178 if (!who) 184 if (!who)
179 who = current->uid; 185 who = cred->uid;
180 else 186 else if ((who != cred->uid) &&
181 if ((who != current->uid) && !(user = find_user(who))) 187 !(user = find_user(who)))
182 goto out_unlock; /* No processes for this user */ 188 goto out_unlock; /* No processes for this user */
183 189
184 do_each_thread(g, p) 190 do_each_thread(g, p)
185 if (p->uid == who) 191 if (__task_cred(p)->uid == who)
186 error = set_one_prio(p, niceval, error); 192 error = set_one_prio(p, niceval, error);
187 while_each_thread(g, p); 193 while_each_thread(g, p);
188 if (who != current->uid) 194 if (who != cred->uid)
189 free_uid(user); /* For find_user() */ 195 free_uid(user); /* For find_user() */
190 break; 196 break;
191 } 197 }
@@ -205,6 +211,7 @@ asmlinkage long sys_getpriority(int which, int who)
205{ 211{
206 struct task_struct *g, *p; 212 struct task_struct *g, *p;
207 struct user_struct *user; 213 struct user_struct *user;
214 const struct cred *cred = current_cred();
208 long niceval, retval = -ESRCH; 215 long niceval, retval = -ESRCH;
209 struct pid *pgrp; 216 struct pid *pgrp;
210 217
@@ -236,21 +243,21 @@ asmlinkage long sys_getpriority(int which, int who)
236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 243 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 244 break;
238 case PRIO_USER: 245 case PRIO_USER:
239 user = current->user; 246 user = (struct user_struct *) cred->user;
240 if (!who) 247 if (!who)
241 who = current->uid; 248 who = cred->uid;
242 else 249 else if ((who != cred->uid) &&
243 if ((who != current->uid) && !(user = find_user(who))) 250 !(user = find_user(who)))
244 goto out_unlock; /* No processes for this user */ 251 goto out_unlock; /* No processes for this user */
245 252
246 do_each_thread(g, p) 253 do_each_thread(g, p)
247 if (p->uid == who) { 254 if (__task_cred(p)->uid == who) {
248 niceval = 20 - task_nice(p); 255 niceval = 20 - task_nice(p);
249 if (niceval > retval) 256 if (niceval > retval)
250 retval = niceval; 257 retval = niceval;
251 } 258 }
252 while_each_thread(g, p); 259 while_each_thread(g, p);
253 if (who != current->uid) 260 if (who != cred->uid)
254 free_uid(user); /* for find_user() */ 261 free_uid(user); /* for find_user() */
255 break; 262 break;
256 } 263 }
@@ -472,46 +479,48 @@ void ctrl_alt_del(void)
472 */ 479 */
473asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 480asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
474{ 481{
475 int old_rgid = current->gid; 482 const struct cred *old;
476 int old_egid = current->egid; 483 struct cred *new;
477 int new_rgid = old_rgid;
478 int new_egid = old_egid;
479 int retval; 484 int retval;
480 485
486 new = prepare_creds();
487 if (!new)
488 return -ENOMEM;
489 old = current_cred();
490
481 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); 491 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
482 if (retval) 492 if (retval)
483 return retval; 493 goto error;
484 494
495 retval = -EPERM;
485 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
486 if ((old_rgid == rgid) || 497 if (old->gid == rgid ||
487 (current->egid==rgid) || 498 old->egid == rgid ||
488 capable(CAP_SETGID)) 499 capable(CAP_SETGID))
489 new_rgid = rgid; 500 new->gid = rgid;
490 else 501 else
491 return -EPERM; 502 goto error;
492 } 503 }
493 if (egid != (gid_t) -1) { 504 if (egid != (gid_t) -1) {
494 if ((old_rgid == egid) || 505 if (old->gid == egid ||
495 (current->egid == egid) || 506 old->egid == egid ||
496 (current->sgid == egid) || 507 old->sgid == egid ||
497 capable(CAP_SETGID)) 508 capable(CAP_SETGID))
498 new_egid = egid; 509 new->egid = egid;
499 else 510 else
500 return -EPERM; 511 goto error;
501 }
502 if (new_egid != old_egid) {
503 set_dumpable(current->mm, suid_dumpable);
504 smp_wmb();
505 } 512 }
513
506 if (rgid != (gid_t) -1 || 514 if (rgid != (gid_t) -1 ||
507 (egid != (gid_t) -1 && egid != old_rgid)) 515 (egid != (gid_t) -1 && egid != old->gid))
508 current->sgid = new_egid; 516 new->sgid = new->egid;
509 current->fsgid = new_egid; 517 new->fsgid = new->egid;
510 current->egid = new_egid; 518
511 current->gid = new_rgid; 519 return commit_creds(new);
512 key_fsgid_changed(current); 520
513 proc_id_connector(current, PROC_EVENT_GID); 521error:
514 return 0; 522 abort_creds(new);
523 return retval;
515} 524}
516 525
517/* 526/*
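
All of the setid paths converted below follow the same copy-modify-commit shape introduced by the credentials rework. A minimal generic sketch (check_permissions and some_gid are placeholders, not real kernel symbols):

    struct cred *new;
    int err;

    new = prepare_creds();              /* private copy of current's creds */
    if (!new)
            return -ENOMEM;

    err = check_permissions();          /* hypothetical; e.g. an LSM hook */
    if (err < 0) {
            abort_creds(new);           /* discard the unused copy */
            return err;
    }

    new->egid = new->fsgid = some_gid;  /* mutate only the copy */
    return commit_creds(new);           /* publish atomically, drop old creds */
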
@@ -521,56 +530,54 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
521 */ 530 */
522asmlinkage long sys_setgid(gid_t gid) 531asmlinkage long sys_setgid(gid_t gid)
523{ 532{
524 int old_egid = current->egid; 533 const struct cred *old;
534 struct cred *new;
525 int retval; 535 int retval;
526 536
537 new = prepare_creds();
538 if (!new)
539 return -ENOMEM;
540 old = current_cred();
541
527 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); 542 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
528 if (retval) 543 if (retval)
529 return retval; 544 goto error;
530 545
531 if (capable(CAP_SETGID)) { 546 retval = -EPERM;
532 if (old_egid != gid) { 547 if (capable(CAP_SETGID))
533 set_dumpable(current->mm, suid_dumpable); 548 new->gid = new->egid = new->sgid = new->fsgid = gid;
534 smp_wmb(); 549 else if (gid == old->gid || gid == old->sgid)
535 } 550 new->egid = new->fsgid = gid;
536 current->gid = current->egid = current->sgid = current->fsgid = gid;
537 } else if ((gid == current->gid) || (gid == current->sgid)) {
538 if (old_egid != gid) {
539 set_dumpable(current->mm, suid_dumpable);
540 smp_wmb();
541 }
542 current->egid = current->fsgid = gid;
543 }
544 else 551 else
545 return -EPERM; 552 goto error;
546 553
547 key_fsgid_changed(current); 554 return commit_creds(new);
548 proc_id_connector(current, PROC_EVENT_GID); 555
549 return 0; 556error:
557 abort_creds(new);
558 return retval;
550} 559}
551 560
552static int set_user(uid_t new_ruid, int dumpclear) 561/*
562 * change the user struct in a credentials set to match the new UID
563 */
564static int set_user(struct cred *new)
553{ 565{
554 struct user_struct *new_user; 566 struct user_struct *new_user;
555 567
556 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); 568 new_user = alloc_uid(current_user_ns(), new->uid);
557 if (!new_user) 569 if (!new_user)
558 return -EAGAIN; 570 return -EAGAIN;
559 571
560 if (atomic_read(&new_user->processes) >= 572 if (atomic_read(&new_user->processes) >=
561 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 573 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
562 new_user != current->nsproxy->user_ns->root_user) { 574 new_user != INIT_USER) {
563 free_uid(new_user); 575 free_uid(new_user);
564 return -EAGAIN; 576 return -EAGAIN;
565 } 577 }
566 578
567 switch_uid(new_user); 579 free_uid(new->user);
568 580 new->user = new_user;
569 if (dumpclear) {
570 set_dumpable(current->mm, suid_dumpable);
571 smp_wmb();
572 }
573 current->uid = new_ruid;
574 return 0; 581 return 0;
575} 582}
576 583
@@ -591,54 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
591 */ 598 */
592asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 599asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
593{ 600{
594 int old_ruid, old_euid, old_suid, new_ruid, new_euid; 601 const struct cred *old;
602 struct cred *new;
595 int retval; 603 int retval;
596 604
605 new = prepare_creds();
606 if (!new)
607 return -ENOMEM;
608 old = current_cred();
609
597 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); 610 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
598 if (retval) 611 if (retval)
599 return retval; 612 goto error;
600
601 new_ruid = old_ruid = current->uid;
602 new_euid = old_euid = current->euid;
603 old_suid = current->suid;
604 613
614 retval = -EPERM;
605 if (ruid != (uid_t) -1) { 615 if (ruid != (uid_t) -1) {
606 new_ruid = ruid; 616 new->uid = ruid;
607 if ((old_ruid != ruid) && 617 if (old->uid != ruid &&
608 (current->euid != ruid) && 618 old->euid != ruid &&
609 !capable(CAP_SETUID)) 619 !capable(CAP_SETUID))
610 return -EPERM; 620 goto error;
611 } 621 }
612 622
613 if (euid != (uid_t) -1) { 623 if (euid != (uid_t) -1) {
614 new_euid = euid; 624 new->euid = euid;
615 if ((old_ruid != euid) && 625 if (old->uid != euid &&
616 (current->euid != euid) && 626 old->euid != euid &&
617 (current->suid != euid) && 627 old->suid != euid &&
618 !capable(CAP_SETUID)) 628 !capable(CAP_SETUID))
619 return -EPERM; 629 goto error;
620 } 630 }
621 631
622 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 632 retval = -EAGAIN;
623 return -EAGAIN; 633 if (new->uid != old->uid && set_user(new) < 0)
634 goto error;
624 635
625 if (new_euid != old_euid) {
626 set_dumpable(current->mm, suid_dumpable);
627 smp_wmb();
628 }
629 current->fsuid = current->euid = new_euid;
630 if (ruid != (uid_t) -1 || 636 if (ruid != (uid_t) -1 ||
631 (euid != (uid_t) -1 && euid != old_ruid)) 637 (euid != (uid_t) -1 && euid != old->uid))
632 current->suid = current->euid; 638 new->suid = new->euid;
633 current->fsuid = current->euid; 639 new->fsuid = new->euid;
634 640
635 key_fsuid_changed(current); 641 retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
636 proc_id_connector(current, PROC_EVENT_UID); 642 if (retval < 0)
637 643 goto error;
638 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
639}
640 644
645 return commit_creds(new);
641 646
647error:
648 abort_creds(new);
649 return retval;
650}
642 651
643/* 652/*
644 * setuid() is implemented like SysV with SAVED_IDS 653 * setuid() is implemented like SysV with SAVED_IDS
@@ -653,36 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 */ 662 */
654asmlinkage long sys_setuid(uid_t uid) 663asmlinkage long sys_setuid(uid_t uid)
655{ 664{
656 int old_euid = current->euid; 665 const struct cred *old;
657 int old_ruid, old_suid, new_suid; 666 struct cred *new;
658 int retval; 667 int retval;
659 668
669 new = prepare_creds();
670 if (!new)
671 return -ENOMEM;
672 old = current_cred();
673
660 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 674 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
661 if (retval) 675 if (retval)
662 return retval; 676 goto error;
663 677
664 old_ruid = current->uid; 678 retval = -EPERM;
665 old_suid = current->suid;
666 new_suid = old_suid;
667
668 if (capable(CAP_SETUID)) { 679 if (capable(CAP_SETUID)) {
669 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) 680 new->suid = new->uid = uid;
670 return -EAGAIN; 681 if (uid != old->uid && set_user(new) < 0) {
671 new_suid = uid; 682 retval = -EAGAIN;
672 } else if ((uid != current->uid) && (uid != new_suid)) 683 goto error;
673 return -EPERM; 684 }
674 685 } else if (uid != old->uid && uid != new->suid) {
675 if (old_euid != uid) { 686 goto error;
676 set_dumpable(current->mm, suid_dumpable);
677 smp_wmb();
678 } 687 }
679 current->fsuid = current->euid = uid;
680 current->suid = new_suid;
681 688
682 key_fsuid_changed(current); 689 new->fsuid = new->euid = uid;
683 proc_id_connector(current, PROC_EVENT_UID); 690
691 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
692 if (retval < 0)
693 goto error;
684 694
685 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 695 return commit_creds(new);
696
697error:
698 abort_creds(new);
699 return retval;
686} 700}
687 701
688 702
@@ -692,54 +706,63 @@ asmlinkage long sys_setuid(uid_t uid)
692 */ 706 */
693asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 707asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
694{ 708{
695 int old_ruid = current->uid; 709 const struct cred *old;
696 int old_euid = current->euid; 710 struct cred *new;
697 int old_suid = current->suid;
698 int retval; 711 int retval;
699 712
713 new = prepare_creds();
714 if (!new)
715 return -ENOMEM;
716
700 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); 717 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
701 if (retval) 718 if (retval)
702 return retval; 719 goto error;
720 old = current_cred();
703 721
722 retval = -EPERM;
704 if (!capable(CAP_SETUID)) { 723 if (!capable(CAP_SETUID)) {
705 if ((ruid != (uid_t) -1) && (ruid != current->uid) && 724 if (ruid != (uid_t) -1 && ruid != old->uid &&
706 (ruid != current->euid) && (ruid != current->suid)) 725 ruid != old->euid && ruid != old->suid)
707 return -EPERM; 726 goto error;
708 if ((euid != (uid_t) -1) && (euid != current->uid) && 727 if (euid != (uid_t) -1 && euid != old->uid &&
709 (euid != current->euid) && (euid != current->suid)) 728 euid != old->euid && euid != old->suid)
710 return -EPERM; 729 goto error;
711 if ((suid != (uid_t) -1) && (suid != current->uid) && 730 if (suid != (uid_t) -1 && suid != old->uid &&
712 (suid != current->euid) && (suid != current->suid)) 731 suid != old->euid && suid != old->suid)
713 return -EPERM; 732 goto error;
714 } 733 }
734
735 retval = -EAGAIN;
715 if (ruid != (uid_t) -1) { 736 if (ruid != (uid_t) -1) {
716 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) 737 new->uid = ruid;
717 return -EAGAIN; 738 if (ruid != old->uid && set_user(new) < 0)
739 goto error;
718 } 740 }
719 if (euid != (uid_t) -1) { 741 if (euid != (uid_t) -1)
720 if (euid != current->euid) { 742 new->euid = euid;
721 set_dumpable(current->mm, suid_dumpable);
722 smp_wmb();
723 }
724 current->euid = euid;
725 }
726 current->fsuid = current->euid;
727 if (suid != (uid_t) -1) 743 if (suid != (uid_t) -1)
728 current->suid = suid; 744 new->suid = suid;
745 new->fsuid = new->euid;
746
747 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
748 if (retval < 0)
749 goto error;
729 750
730 key_fsuid_changed(current); 751 return commit_creds(new);
731 proc_id_connector(current, PROC_EVENT_UID);
732 752
733 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 753error:
754 abort_creds(new);
755 return retval;
734} 756}
735 757
736asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 758asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
737{ 759{
760 const struct cred *cred = current_cred();
738 int retval; 761 int retval;
739 762
740 if (!(retval = put_user(current->uid, ruid)) && 763 if (!(retval = put_user(cred->uid, ruid)) &&
741 !(retval = put_user(current->euid, euid))) 764 !(retval = put_user(cred->euid, euid)))
742 retval = put_user(current->suid, suid); 765 retval = put_user(cred->suid, suid);
743 766
744 return retval; 767 return retval;
745} 768}
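
Reads now go through the RCU-protected credential record instead of loose task_struct fields, so no task locking is needed. Two equivalent read styles, sketched for illustration:

    const struct cred *cred = current_cred();
    uid_t ruid = cred->uid;
    uid_t euid = cred->euid;

    /* or the per-field helpers */
    uid_t fsuid = current_fsuid();
    uid_t euid2 = current_euid();
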
@@ -749,48 +772,55 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
749 */ 772 */
750asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 773asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
751{ 774{
775 const struct cred *old;
776 struct cred *new;
752 int retval; 777 int retval;
753 778
779 new = prepare_creds();
780 if (!new)
781 return -ENOMEM;
782 old = current_cred();
783
754 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); 784 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
755 if (retval) 785 if (retval)
756 return retval; 786 goto error;
757 787
788 retval = -EPERM;
758 if (!capable(CAP_SETGID)) { 789 if (!capable(CAP_SETGID)) {
759 if ((rgid != (gid_t) -1) && (rgid != current->gid) && 790 if (rgid != (gid_t) -1 && rgid != old->gid &&
760 (rgid != current->egid) && (rgid != current->sgid)) 791 rgid != old->egid && rgid != old->sgid)
761 return -EPERM; 792 goto error;
762 if ((egid != (gid_t) -1) && (egid != current->gid) && 793 if (egid != (gid_t) -1 && egid != old->gid &&
763 (egid != current->egid) && (egid != current->sgid)) 794 egid != old->egid && egid != old->sgid)
764 return -EPERM; 795 goto error;
765 if ((sgid != (gid_t) -1) && (sgid != current->gid) && 796 if (sgid != (gid_t) -1 && sgid != old->gid &&
766 (sgid != current->egid) && (sgid != current->sgid)) 797 sgid != old->egid && sgid != old->sgid)
767 return -EPERM; 798 goto error;
768 } 799 }
769 if (egid != (gid_t) -1) { 800
770 if (egid != current->egid) {
771 set_dumpable(current->mm, suid_dumpable);
772 smp_wmb();
773 }
774 current->egid = egid;
775 }
776 current->fsgid = current->egid;
777 if (rgid != (gid_t) -1) 801 if (rgid != (gid_t) -1)
778 current->gid = rgid; 802 new->gid = rgid;
803 if (egid != (gid_t) -1)
804 new->egid = egid;
779 if (sgid != (gid_t) -1) 805 if (sgid != (gid_t) -1)
780 current->sgid = sgid; 806 new->sgid = sgid;
807 new->fsgid = new->egid;
781 808
782 key_fsgid_changed(current); 809 return commit_creds(new);
783 proc_id_connector(current, PROC_EVENT_GID); 810
784 return 0; 811error:
812 abort_creds(new);
813 return retval;
785} 814}
786 815
787asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 816asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
788{ 817{
818 const struct cred *cred = current_cred();
789 int retval; 819 int retval;
790 820
791 if (!(retval = put_user(current->gid, rgid)) && 821 if (!(retval = put_user(cred->gid, rgid)) &&
792 !(retval = put_user(current->egid, egid))) 822 !(retval = put_user(cred->egid, egid)))
793 retval = put_user(current->sgid, sgid); 823 retval = put_user(cred->sgid, sgid);
794 824
795 return retval; 825 return retval;
796} 826}
@@ -804,27 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
804 */ 834 */
805asmlinkage long sys_setfsuid(uid_t uid) 835asmlinkage long sys_setfsuid(uid_t uid)
806{ 836{
807 int old_fsuid; 837 const struct cred *old;
838 struct cred *new;
839 uid_t old_fsuid;
808 840
809 old_fsuid = current->fsuid; 841 new = prepare_creds();
810 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) 842 if (!new)
811 return old_fsuid; 843 return current_fsuid();
844 old = current_cred();
845 old_fsuid = old->fsuid;
812 846
813 if (uid == current->uid || uid == current->euid || 847 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
814 uid == current->suid || uid == current->fsuid || 848 goto error;
849
850 if (uid == old->uid || uid == old->euid ||
851 uid == old->suid || uid == old->fsuid ||
815 capable(CAP_SETUID)) { 852 capable(CAP_SETUID)) {
816 if (uid != old_fsuid) { 853 if (uid != old_fsuid) {
817 set_dumpable(current->mm, suid_dumpable); 854 new->fsuid = uid;
818 smp_wmb(); 855 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
856 goto change_okay;
819 } 857 }
820 current->fsuid = uid;
821 } 858 }
822 859
823 key_fsuid_changed(current); 860error:
824 proc_id_connector(current, PROC_EVENT_UID); 861 abort_creds(new);
825 862 return old_fsuid;
826 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
827 863
864change_okay:
865 commit_creds(new);
828 return old_fsuid; 866 return old_fsuid;
829} 867}
830 868
@@ -833,23 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
833 */ 871 */
834asmlinkage long sys_setfsgid(gid_t gid) 872asmlinkage long sys_setfsgid(gid_t gid)
835{ 873{
836 int old_fsgid; 874 const struct cred *old;
875 struct cred *new;
876 gid_t old_fsgid;
877
878 new = prepare_creds();
879 if (!new)
880 return current_fsgid();
881 old = current_cred();
882 old_fsgid = old->fsgid;
837 883
838 old_fsgid = current->fsgid;
839 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) 884 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
840 return old_fsgid; 885 goto error;
841 886
842 if (gid == current->gid || gid == current->egid || 887 if (gid == old->gid || gid == old->egid ||
843 gid == current->sgid || gid == current->fsgid || 888 gid == old->sgid || gid == old->fsgid ||
844 capable(CAP_SETGID)) { 889 capable(CAP_SETGID)) {
845 if (gid != old_fsgid) { 890 if (gid != old_fsgid) {
846 set_dumpable(current->mm, suid_dumpable); 891 new->fsgid = gid;
847 smp_wmb(); 892 goto change_okay;
848 } 893 }
849 current->fsgid = gid;
850 key_fsgid_changed(current);
851 proc_id_connector(current, PROC_EVENT_GID);
852 } 894 }
895
896error:
897 abort_creds(new);
898 return old_fsgid;
899
900change_okay:
901 commit_creds(new);
853 return old_fsgid; 902 return old_fsgid;
854} 903}
855 904
@@ -1118,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
1118 1167
1119/* export the group_info to a user-space array */ 1168/* export the group_info to a user-space array */
1120static int groups_to_user(gid_t __user *grouplist, 1169static int groups_to_user(gid_t __user *grouplist,
1121 struct group_info *group_info) 1170 const struct group_info *group_info)
1122{ 1171{
1123 int i; 1172 int i;
1124 unsigned int count = group_info->ngroups; 1173 unsigned int count = group_info->ngroups;
@@ -1186,7 +1235,7 @@ static void groups_sort(struct group_info *group_info)
1186} 1235}
1187 1236
1188/* a simple bsearch */ 1237/* a simple bsearch */
1189int groups_search(struct group_info *group_info, gid_t grp) 1238int groups_search(const struct group_info *group_info, gid_t grp)
1190{ 1239{
1191 unsigned int left, right; 1240 unsigned int left, right;
1192 1241
@@ -1208,51 +1257,74 @@ int groups_search(struct group_info *group_info, gid_t grp)
1208 return 0; 1257 return 0;
1209} 1258}
1210 1259
1211/* validate and set current->group_info */ 1260/**
1212int set_current_groups(struct group_info *group_info) 1261 * set_groups - Change a group subscription in a set of credentials
1262 * @new: The newly prepared set of credentials to alter
1263 * @group_info: The group list to install
1264 *
1265 * Validate a group subscription and, if valid, insert it into a set
1266 * of credentials.
1267 */
1268int set_groups(struct cred *new, struct group_info *group_info)
1213{ 1269{
1214 int retval; 1270 int retval;
1215 struct group_info *old_info;
1216 1271
1217 retval = security_task_setgroups(group_info); 1272 retval = security_task_setgroups(group_info);
1218 if (retval) 1273 if (retval)
1219 return retval; 1274 return retval;
1220 1275
1276 put_group_info(new->group_info);
1221 groups_sort(group_info); 1277 groups_sort(group_info);
1222 get_group_info(group_info); 1278 get_group_info(group_info);
1279 new->group_info = group_info;
1280 return 0;
1281}
1282
1283EXPORT_SYMBOL(set_groups);
1223 1284
1224 task_lock(current); 1285/**
1225 old_info = current->group_info; 1286 * set_current_groups - Change current's group subscription
1226 current->group_info = group_info; 1287 * @group_info: The group list to impose
1227 task_unlock(current); 1288 *
1289 * Validate a group subscription and, if valid, impose it upon current's task
1290 * security record.
1291 */
1292int set_current_groups(struct group_info *group_info)
1293{
1294 struct cred *new;
1295 int ret;
1228 1296
1229 put_group_info(old_info); 1297 new = prepare_creds();
1298 if (!new)
1299 return -ENOMEM;
1230 1300
1231 return 0; 1301 ret = set_groups(new, group_info);
1302 if (ret < 0) {
1303 abort_creds(new);
1304 return ret;
1305 }
1306
1307 return commit_creds(new);
1232} 1308}
1233 1309
1234EXPORT_SYMBOL(set_current_groups); 1310EXPORT_SYMBOL(set_current_groups);
1235 1311
1236asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1312asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1237{ 1313{
1238 int i = 0; 1314 const struct cred *cred = current_cred();
1239 1315 int i;
1240 /*
1241 * SMP: Nobody else can change our grouplist. Thus we are
1242 * safe.
1243 */
1244 1316
1245 if (gidsetsize < 0) 1317 if (gidsetsize < 0)
1246 return -EINVAL; 1318 return -EINVAL;
1247 1319
1248 /* no need to grab task_lock here; it cannot change */ 1320 /* no need to grab task_lock here; it cannot change */
1249 i = current->group_info->ngroups; 1321 i = cred->group_info->ngroups;
1250 if (gidsetsize) { 1322 if (gidsetsize) {
1251 if (i > gidsetsize) { 1323 if (i > gidsetsize) {
1252 i = -EINVAL; 1324 i = -EINVAL;
1253 goto out; 1325 goto out;
1254 } 1326 }
1255 if (groups_to_user(grouplist, current->group_info)) { 1327 if (groups_to_user(grouplist, cred->group_info)) {
1256 i = -EFAULT; 1328 i = -EFAULT;
1257 goto out; 1329 goto out;
1258 } 1330 }
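
set_groups() operates on an already-prepared credential set, and set_current_groups() wraps it in the prepare/commit dance. A hedged sketch of a caller installing a one-entry supplementary group list:

    struct group_info *gi;
    int err;

    gi = groups_alloc(1);
    if (!gi)
            return -ENOMEM;
    GROUP_AT(gi, 0) = 0;            /* hypothetical: a single gid 0 entry */
    err = set_current_groups(gi);   /* validates, sorts and commits new creds */
    put_group_info(gi);             /* set_current_groups() took its own ref */
    return err;
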
@@ -1296,9 +1368,11 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1296 */ 1368 */
1297int in_group_p(gid_t grp) 1369int in_group_p(gid_t grp)
1298{ 1370{
1371 const struct cred *cred = current_cred();
1299 int retval = 1; 1372 int retval = 1;
1300 if (grp != current->fsgid) 1373
1301 retval = groups_search(current->group_info, grp); 1374 if (grp != cred->fsgid)
1375 retval = groups_search(cred->group_info, grp);
1302 return retval; 1376 return retval;
1303} 1377}
1304 1378
@@ -1306,9 +1380,11 @@ EXPORT_SYMBOL(in_group_p);
1306 1380
1307int in_egroup_p(gid_t grp) 1381int in_egroup_p(gid_t grp)
1308{ 1382{
1383 const struct cred *cred = current_cred();
1309 int retval = 1; 1384 int retval = 1;
1310 if (grp != current->egid) 1385
1311 retval = groups_search(current->group_info, grp); 1386 if (grp != cred->egid)
1387 retval = groups_search(cred->group_info, grp);
1312 return retval; 1388 return retval;
1313} 1389}
1314 1390
@@ -1624,50 +1700,56 @@ asmlinkage long sys_umask(int mask)
1624asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1700asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1625 unsigned long arg4, unsigned long arg5) 1701 unsigned long arg4, unsigned long arg5)
1626{ 1702{
1627 long error = 0; 1703 struct task_struct *me = current;
1704 unsigned char comm[sizeof(me->comm)];
1705 long error;
1628 1706
1629 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) 1707 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1708 if (error != -ENOSYS)
1630 return error; 1709 return error;
1631 1710
1711 error = 0;
1632 switch (option) { 1712 switch (option) {
1633 case PR_SET_PDEATHSIG: 1713 case PR_SET_PDEATHSIG:
1634 if (!valid_signal(arg2)) { 1714 if (!valid_signal(arg2)) {
1635 error = -EINVAL; 1715 error = -EINVAL;
1636 break; 1716 break;
1637 } 1717 }
1638 current->pdeath_signal = arg2; 1718 me->pdeath_signal = arg2;
1719 error = 0;
1639 break; 1720 break;
1640 case PR_GET_PDEATHSIG: 1721 case PR_GET_PDEATHSIG:
1641 error = put_user(current->pdeath_signal, (int __user *)arg2); 1722 error = put_user(me->pdeath_signal, (int __user *)arg2);
1642 break; 1723 break;
1643 case PR_GET_DUMPABLE: 1724 case PR_GET_DUMPABLE:
1644 error = get_dumpable(current->mm); 1725 error = get_dumpable(me->mm);
1645 break; 1726 break;
1646 case PR_SET_DUMPABLE: 1727 case PR_SET_DUMPABLE:
1647 if (arg2 < 0 || arg2 > 1) { 1728 if (arg2 < 0 || arg2 > 1) {
1648 error = -EINVAL; 1729 error = -EINVAL;
1649 break; 1730 break;
1650 } 1731 }
1651 set_dumpable(current->mm, arg2); 1732 set_dumpable(me->mm, arg2);
1733 error = 0;
1652 break; 1734 break;
1653 1735
1654 case PR_SET_UNALIGN: 1736 case PR_SET_UNALIGN:
1655 error = SET_UNALIGN_CTL(current, arg2); 1737 error = SET_UNALIGN_CTL(me, arg2);
1656 break; 1738 break;
1657 case PR_GET_UNALIGN: 1739 case PR_GET_UNALIGN:
1658 error = GET_UNALIGN_CTL(current, arg2); 1740 error = GET_UNALIGN_CTL(me, arg2);
1659 break; 1741 break;
1660 case PR_SET_FPEMU: 1742 case PR_SET_FPEMU:
1661 error = SET_FPEMU_CTL(current, arg2); 1743 error = SET_FPEMU_CTL(me, arg2);
1662 break; 1744 break;
1663 case PR_GET_FPEMU: 1745 case PR_GET_FPEMU:
1664 error = GET_FPEMU_CTL(current, arg2); 1746 error = GET_FPEMU_CTL(me, arg2);
1665 break; 1747 break;
1666 case PR_SET_FPEXC: 1748 case PR_SET_FPEXC:
1667 error = SET_FPEXC_CTL(current, arg2); 1749 error = SET_FPEXC_CTL(me, arg2);
1668 break; 1750 break;
1669 case PR_GET_FPEXC: 1751 case PR_GET_FPEXC:
1670 error = GET_FPEXC_CTL(current, arg2); 1752 error = GET_FPEXC_CTL(me, arg2);
1671 break; 1753 break;
1672 case PR_GET_TIMING: 1754 case PR_GET_TIMING:
1673 error = PR_TIMING_STATISTICAL; 1755 error = PR_TIMING_STATISTICAL;
@@ -1675,33 +1757,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1675 case PR_SET_TIMING: 1757 case PR_SET_TIMING:
1676 if (arg2 != PR_TIMING_STATISTICAL) 1758 if (arg2 != PR_TIMING_STATISTICAL)
1677 error = -EINVAL; 1759 error = -EINVAL;
1760 else
1761 error = 0;
1678 break; 1762 break;
1679 1763
1680 case PR_SET_NAME: { 1764 case PR_SET_NAME:
1681 struct task_struct *me = current; 1765 comm[sizeof(me->comm)-1] = 0;
1682 unsigned char ncomm[sizeof(me->comm)]; 1766 if (strncpy_from_user(comm, (char __user *)arg2,
1683 1767 sizeof(me->comm) - 1) < 0)
1684 ncomm[sizeof(me->comm)-1] = 0;
1685 if (strncpy_from_user(ncomm, (char __user *)arg2,
1686 sizeof(me->comm)-1) < 0)
1687 return -EFAULT; 1768 return -EFAULT;
1688 set_task_comm(me, ncomm); 1769 set_task_comm(me, comm);
1689 return 0; 1770 return 0;
1690 } 1771 case PR_GET_NAME:
1691 case PR_GET_NAME: { 1772 get_task_comm(comm, me);
1692 struct task_struct *me = current; 1773 if (copy_to_user((char __user *)arg2, comm,
1693 unsigned char tcomm[sizeof(me->comm)]; 1774 sizeof(comm)))
1694
1695 get_task_comm(tcomm, me);
1696 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1697 return -EFAULT; 1775 return -EFAULT;
1698 return 0; 1776 return 0;
1699 }
1700 case PR_GET_ENDIAN: 1777 case PR_GET_ENDIAN:
1701 error = GET_ENDIAN(current, arg2); 1778 error = GET_ENDIAN(me, arg2);
1702 break; 1779 break;
1703 case PR_SET_ENDIAN: 1780 case PR_SET_ENDIAN:
1704 error = SET_ENDIAN(current, arg2); 1781 error = SET_ENDIAN(me, arg2);
1705 break; 1782 break;
1706 1783
1707 case PR_GET_SECCOMP: 1784 case PR_GET_SECCOMP:
@@ -1725,6 +1802,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1725 current->default_timer_slack_ns; 1802 current->default_timer_slack_ns;
1726 else 1803 else
1727 current->timer_slack_ns = arg2; 1804 current->timer_slack_ns = arg2;
1805 error = 0;
1728 break; 1806 break;
1729 default: 1807 default:
1730 error = -EINVAL; 1808 error = -EINVAL;
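
From userspace the PR_SET_NAME/PR_GET_NAME behaviour is unchanged by this cleanup; the name is still capped at sizeof(task_struct.comm), 16 bytes including the NUL. A small illustrative program:

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            char name[16];

            prctl(PR_SET_NAME, "worker", 0, 0, 0);
            prctl(PR_GET_NAME, name, 0, 0, 0);
            printf("comm: %s\n", name);     /* prints "worker" */
            return 0;
    }
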
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c83f566e940a..ff6d45c7626f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -121,6 +121,10 @@ extern int sg_big_buff;
121#include <asm/system.h> 121#include <asm/system.h>
122#endif 122#endif
123 123
124#ifdef CONFIG_SPARC64
125extern int sysctl_tsb_ratio;
126#endif
127
124#ifdef __hppa__ 128#ifdef __hppa__
125extern int pwrsw_enabled; 129extern int pwrsw_enabled;
126extern int unaligned_enabled; 130extern int unaligned_enabled;
@@ -451,6 +455,16 @@ static struct ctl_table kern_table[] = {
451 .proc_handler = &proc_dointvec, 455 .proc_handler = &proc_dointvec,
452 }, 456 },
453#endif 457#endif
458#ifdef CONFIG_SPARC64
459 {
460 .ctl_name = CTL_UNNUMBERED,
461 .procname = "tsb-ratio",
462 .data = &sysctl_tsb_ratio,
463 .maxlen = sizeof (int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec,
466 },
467#endif
454#ifdef __hppa__ 468#ifdef __hppa__
455 { 469 {
456 .ctl_name = KERN_HPPA_PWRSW, 470 .ctl_name = KERN_HPPA_PWRSW,
@@ -487,6 +501,16 @@ static struct ctl_table kern_table[] = {
487 .proc_handler = &ftrace_enable_sysctl, 501 .proc_handler = &ftrace_enable_sysctl,
488 }, 502 },
489#endif 503#endif
504#ifdef CONFIG_STACK_TRACER
505 {
506 .ctl_name = CTL_UNNUMBERED,
507 .procname = "stack_tracer_enabled",
508 .data = &stack_tracer_enabled,
509 .maxlen = sizeof(int),
510 .mode = 0644,
511 .proc_handler = &stack_trace_sysctl,
512 },
513#endif
490#ifdef CONFIG_TRACING 514#ifdef CONFIG_TRACING
491 { 515 {
492 .ctl_name = CTL_UNNUMBERED, 516 .ctl_name = CTL_UNNUMBERED,
@@ -1661,7 +1685,7 @@ out:
1661 1685
1662static int test_perm(int mode, int op) 1686static int test_perm(int mode, int op)
1663{ 1687{
1664 if (!current->euid) 1688 if (!current_euid())
1665 mode >>= 6; 1689 mode >>= 6;
1666 else if (in_egroup_p(0)) 1690 else if (in_egroup_p(0))
1667 mode >>= 3; 1691 mode >>= 3;
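
Both new entries follow the CTL_UNNUMBERED convention: they are reachable only via /proc/sys and carry no binary sysctl number. The shape of such an entry, sketched with a hypothetical knob:

    static int my_knob;

    static struct ctl_table my_table[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,
                    .procname       = "my_knob",
                    .data           = &my_knob,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &proc_dointvec,
            },
            { .ctl_name = 0 }       /* terminator */
    };
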
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23ab8fb..fafeb48f27c0 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
730}; 730};
731 731
732static const struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 733 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 734 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" }, 735 { XFS_PANIC_MASK, "panic_mask" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 6d7dc4ec4aa5..888adbcca30c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
290 return; 290 return;
291} 291}
292 292
293static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 293static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294{ 294{
295 struct listener_list *listeners; 295 struct listener_list *listeners;
296 struct listener *s, *tmp; 296 struct listener *s, *tmp;
297 unsigned int cpu; 297 unsigned int cpu;
298 cpumask_t mask = *maskp;
299 298
300 if (!cpus_subset(mask, cpu_possible_map)) 299 if (!cpumask_subset(mask, cpu_possible_mask))
301 return -EINVAL; 300 return -EINVAL;
302 301
303 if (isadd == REGISTER) { 302 if (isadd == REGISTER) {
304 for_each_cpu_mask_nr(cpu, mask) { 303 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 305 cpu_to_node(cpu));
307 if (!s) 306 if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 319
321 /* Deregister or cleanup */ 320 /* Deregister or cleanup */
322cleanup: 321cleanup:
323 for_each_cpu_mask_nr(cpu, mask) { 322 for_each_cpu(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 323 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 324 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
335 return 0; 334 return 0;
336} 335}
337 336
338static int parse(struct nlattr *na, cpumask_t *mask) 337static int parse(struct nlattr *na, struct cpumask *mask)
339{ 338{
340 char *data; 339 char *data;
341 int len; 340 int len;
@@ -428,23 +427,33 @@ err:
428 427
429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430{ 429{
431 int rc = 0; 430 int rc;
432 struct sk_buff *rep_skb; 431 struct sk_buff *rep_skb;
433 struct taskstats *stats; 432 struct taskstats *stats;
434 size_t size; 433 size_t size;
435 cpumask_t mask; 434 cpumask_var_t mask;
435
436 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
437 return -ENOMEM;
436 438
437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
438 if (rc < 0) 440 if (rc < 0)
439 return rc; 441 goto free_return_rc;
440 if (rc == 0) 442 if (rc == 0) {
441 return add_del_listener(info->snd_pid, &mask, REGISTER); 443 rc = add_del_listener(info->snd_pid, mask, REGISTER);
444 goto free_return_rc;
445 }
442 446
443 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
444 if (rc < 0) 448 if (rc < 0)
449 goto free_return_rc;
450 if (rc == 0) {
451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
452free_return_rc:
453 free_cpumask_var(mask);
445 return rc; 454 return rc;
446 if (rc == 0) 455 }
447 return add_del_listener(info->snd_pid, &mask, DEREGISTER); 456 free_cpumask_var(mask);
448 457
449 /* 458 /*
450 * Size includes space for nested attributes 459 * Size includes space for nested attributes
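
cpumask_var_t is an on-stack bitmap for small NR_CPUS and a heap allocation under CONFIG_CPUMASK_OFFSTACK, which is why every exit path above must reach free_cpumask_var(). The basic pattern, sketched:

    cpumask_var_t mask;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
            return -ENOMEM;
    cpumask_copy(mask, cpu_possible_mask);
    /* ... use 'mask' like any other struct cpumask * ... */
    free_cpumask_var(mask);
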
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec97526..ca89e1593f08 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = cpumask_next(raw_smp_processor_id(),
149 cpu_online_mask);
149 150
150 if (next_cpu >= nr_cpu_ids) 151 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 152 next_cpu = cpumask_first(cpu_online_mask);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 153 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 154 add_timer_on(&watchdog_timer, next_cpu);
154 } 155 }
@@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
173 watchdog_last = watchdog->read(); 174 watchdog_last = watchdog->read();
174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 175 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
175 add_timer_on(&watchdog_timer, 176 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map)); 177 cpumask_first(cpu_online_mask));
177 } 178 }
178 } else { 179 } else {
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 180 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
195 watchdog_timer.expires = 196 watchdog_timer.expires =
196 jiffies + WATCHDOG_INTERVAL; 197 jiffies + WATCHDOG_INTERVAL;
197 add_timer_on(&watchdog_timer, 198 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map)); 199 cpumask_first(cpu_online_mask));
199 } 200 }
200 } 201 }
201 } 202 }
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486b..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
164 } 164 }
165 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
166 166
167 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
168 168
169 return res; 169 return res;
170} 170}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9590af2327be..118a3b3b3f9a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
32static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force; 35static int tick_broadcast_force;
34 36
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
46 return &tick_broadcast_device; 48 return &tick_broadcast_device;
47} 49}
48 50
49cpumask_t *tick_get_broadcast_mask(void) 51struct cpumask *tick_get_broadcast_mask(void)
50{ 52{
51 return &tick_broadcast_mask; 53 return to_cpumask(tick_broadcast_mask);
52} 54}
53 55
54/* 56/*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
72 74
73 clockevents_exchange_device(NULL, dev); 75 clockevents_exchange_device(NULL, dev);
74 tick_broadcast_device.evtdev = dev; 76 tick_broadcast_device.evtdev = dev;
75 if (!cpus_empty(tick_broadcast_mask)) 77 if (!cpumask_empty(tick_get_broadcast_mask()))
76 tick_broadcast_start_periodic(dev); 78 tick_broadcast_start_periodic(dev);
77 return 1; 79 return 1;
78} 80}
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
104 */ 106 */
105 if (!tick_device_is_functional(dev)) { 107 if (!tick_device_is_functional(dev)) {
106 dev->event_handler = tick_handle_periodic; 108 dev->event_handler = tick_handle_periodic;
107 cpu_set(cpu, tick_broadcast_mask); 109 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
108 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 110 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
109 ret = 1; 111 ret = 1;
110 } else { 112 } else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 118 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
117 int cpu = smp_processor_id(); 119 int cpu = smp_processor_id();
118 120
119 cpu_clear(cpu, tick_broadcast_mask); 121 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
120 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
121 } 123 }
122 } 124 }
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125} 127}
126 128
127/* 129/*
128 * Broadcast the event to the cpus, which are set in the mask 130 * Broadcast the event to the cpus, which are set in the mask (mangled).
129 */ 131 */
130static void tick_do_broadcast(cpumask_t mask) 132static void tick_do_broadcast(struct cpumask *mask)
131{ 133{
132 int cpu = smp_processor_id(); 134 int cpu = smp_processor_id();
133 struct tick_device *td; 135 struct tick_device *td;
@@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask)
135 /* 137 /*
136 * Check, if the current cpu is in the mask 138 * Check, if the current cpu is in the mask
137 */ 139 */
138 if (cpu_isset(cpu, mask)) { 140 if (cpumask_test_cpu(cpu, mask)) {
139 cpu_clear(cpu, mask); 141 cpumask_clear_cpu(cpu, mask);
140 td = &per_cpu(tick_cpu_device, cpu); 142 td = &per_cpu(tick_cpu_device, cpu);
141 td->evtdev->event_handler(td->evtdev); 143 td->evtdev->event_handler(td->evtdev);
142 } 144 }
143 145
144 if (!cpus_empty(mask)) { 146 if (!cpumask_empty(mask)) {
145 /* 147 /*
146 * It might be necessary to actually check whether the devices 148 * It might be necessary to actually check whether the devices
147 * have different broadcast functions. For now, just use the 149 * have different broadcast functions. For now, just use the
148 * one of the first device. This works as long as we have this 150 * one of the first device. This works as long as we have this
149 * misfeature only on x86 (lapic) 151 * misfeature only on x86 (lapic)
150 */ 152 */
151 cpu = first_cpu(mask); 153 td = &per_cpu(tick_cpu_device, cpumask_first(mask));
152 td = &per_cpu(tick_cpu_device, cpu); 154 td->evtdev->broadcast(mask);
153 td->evtdev->broadcast(&mask);
154 } 155 }
155} 156}
156 157
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
160 */ 161 */
161static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
162{ 163{
163 cpumask_t mask;
164
165 spin_lock(&tick_broadcast_lock); 164 spin_lock(&tick_broadcast_lock);
166 165
167 cpus_and(mask, cpu_online_map, tick_broadcast_mask); 166 cpumask_and(to_cpumask(tmpmask),
168 tick_do_broadcast(mask); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 spin_unlock(&tick_broadcast_lock);
171} 171}
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
228 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
229 goto out; 229 goto out;
230 230
231 bc_stopped = cpus_empty(tick_broadcast_mask); 231 bc_stopped = cpumask_empty(tick_get_broadcast_mask());
232 232
233 switch (*reason) { 233 switch (*reason) {
234 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
238 if (tick_broadcast_device.mode == 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC) 239 TICKDEV_MODE_PERIODIC)
240 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
244 break; 244 break;
245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
249 if (tick_broadcast_device.mode == 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC) 250 TICKDEV_MODE_PERIODIC)
251 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
253 break; 253 break;
254 } 254 }
255 255
256 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpumask_empty(tick_get_broadcast_mask())) {
257 if (!bc_stopped) 257 if (!bc_stopped)
258 clockevents_shutdown(bc); 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) { 259 } else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
272 */ 272 */
273void tick_broadcast_on_off(unsigned long reason, int *oncpu) 273void tick_broadcast_on_off(unsigned long reason, int *oncpu)
274{ 274{
275 if (!cpu_isset(*oncpu, cpu_online_map)) 275 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
303 spin_lock_irqsave(&tick_broadcast_lock, flags); 303 spin_lock_irqsave(&tick_broadcast_lock, flags);
304 304
305 bc = tick_broadcast_device.evtdev; 305 bc = tick_broadcast_device.evtdev;
306 cpu_clear(cpu, tick_broadcast_mask); 306 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
307 307
308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
309 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpumask_empty(tick_get_broadcast_mask()))
310 clockevents_shutdown(bc); 310 clockevents_shutdown(bc);
311 } 311 }
312 312
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
342 342
343 switch (tick_broadcast_device.mode) { 343 switch (tick_broadcast_device.mode) {
344 case TICKDEV_MODE_PERIODIC: 344 case TICKDEV_MODE_PERIODIC:
345 if(!cpus_empty(tick_broadcast_mask)) 345 if (!cpumask_empty(tick_get_broadcast_mask()))
346 tick_broadcast_start_periodic(bc); 346 tick_broadcast_start_periodic(bc);
347 broadcast = cpu_isset(smp_processor_id(), 347 broadcast = cpumask_test_cpu(smp_processor_id(),
348 tick_broadcast_mask); 348 tick_get_broadcast_mask());
349 break; 349 break;
350 case TICKDEV_MODE_ONESHOT: 350 case TICKDEV_MODE_ONESHOT:
351 broadcast = tick_resume_broadcast_oneshot(bc); 351 broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
360 360
361#ifdef CONFIG_TICK_ONESHOT 361#ifdef CONFIG_TICK_ONESHOT
362 362
363static cpumask_t tick_broadcast_oneshot_mask; 363/* FIXME: use cpumask_var_t. */
364static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
364 365
365/* 366/*
366 * Debugging: see timer_list.c 367 * Exposed for debugging: see timer_list.c
367 */ 368 */
368cpumask_t *tick_get_broadcast_oneshot_mask(void) 369struct cpumask *tick_get_broadcast_oneshot_mask(void)
369{ 370{
370 return &tick_broadcast_oneshot_mask; 371 return to_cpumask(tick_broadcast_oneshot_mask);
371} 372}
372 373
373static int tick_broadcast_set_event(ktime_t expires, int force) 374static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
389 */ 390 */
390void tick_check_oneshot_broadcast(int cpu) 391void tick_check_oneshot_broadcast(int cpu)
391{ 392{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 393 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 394 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394 395
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 396 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 403static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
403{ 404{
404 struct tick_device *td; 405 struct tick_device *td;
405 cpumask_t mask;
406 ktime_t now, next_event; 406 ktime_t now, next_event;
407 int cpu; 407 int cpu;
408 408
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
410again: 410again:
411 dev->next_event.tv64 = KTIME_MAX; 411 dev->next_event.tv64 = KTIME_MAX;
412 next_event.tv64 = KTIME_MAX; 412 next_event.tv64 = KTIME_MAX;
413 mask = CPU_MASK_NONE; 413 cpumask_clear(to_cpumask(tmpmask));
414 now = ktime_get(); 414 now = ktime_get();
415 /* Find all expired events */ 415 /* Find all expired events */
416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { 416 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
417 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
418 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
419 cpu_set(cpu, mask); 419 cpumask_set_cpu(cpu, to_cpumask(tmpmask));
420 else if (td->evtdev->next_event.tv64 < next_event.tv64) 420 else if (td->evtdev->next_event.tv64 < next_event.tv64)
421 next_event.tv64 = td->evtdev->next_event.tv64; 421 next_event.tv64 = td->evtdev->next_event.tv64;
422 } 422 }
@@ -424,7 +424,7 @@ again:
424 /* 424 /*
425 * Wakeup the cpus which have an expired event. 425 * Wakeup the cpus which have an expired event.
426 */ 426 */
427 tick_do_broadcast(mask); 427 tick_do_broadcast(to_cpumask(tmpmask));
428 428
429 /* 429 /*
430 * Two reasons for reprogram: 430 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
476 goto out; 476 goto out;
477 477
478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
479 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 479 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
480 cpu_set(cpu, tick_broadcast_oneshot_mask); 480 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
482 if (dev->next_event.tv64 < bc->next_event.tv64) 482 if (dev->next_event.tv64 < bc->next_event.tv64)
483 tick_broadcast_set_event(dev->next_event, 1); 483 tick_broadcast_set_event(dev->next_event, 1);
484 } 484 }
485 } else { 485 } else {
486 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 486 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
487 cpu_clear(cpu, tick_broadcast_oneshot_mask); 487 cpumask_clear_cpu(cpu,
488 tick_get_broadcast_oneshot_mask());
488 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 489 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
489 if (dev->next_event.tv64 != KTIME_MAX) 490 if (dev->next_event.tv64 != KTIME_MAX)
490 tick_program_event(dev->next_event, 1); 491 tick_program_event(dev->next_event, 1);
@@ -502,15 +503,16 @@ out:
502 */ 503 */
503static void tick_broadcast_clear_oneshot(int cpu) 504static void tick_broadcast_clear_oneshot(int cpu)
504{ 505{
505 cpu_clear(cpu, tick_broadcast_oneshot_mask); 506 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
506} 507}
507 508
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) 509static void tick_broadcast_init_next_event(struct cpumask *mask,
510 ktime_t expires)
509{ 511{
510 struct tick_device *td; 512 struct tick_device *td;
511 int cpu; 513 int cpu;
512 514
513 for_each_cpu_mask_nr(cpu, *mask) { 515 for_each_cpu(cpu, mask) {
514 td = &per_cpu(tick_cpu_device, cpu); 516 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev) 517 if (td->evtdev)
516 td->evtdev->next_event = expires; 518 td->evtdev->next_event = expires;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id(); 530 int cpu = smp_processor_id();
529 cpumask_t mask;
530 531
531 bc->event_handler = tick_handle_oneshot_broadcast; 532 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
540 * oneshot_mask bits for those and program the 541 * oneshot_mask bits for those and program the
541 * broadcast device to fire. 542 * broadcast device to fire.
542 */ 543 */
543 mask = tick_broadcast_mask; 544 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
544 cpu_clear(cpu, mask); 545 cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
545 cpus_or(tick_broadcast_oneshot_mask, 546 cpumask_or(tick_get_broadcast_oneshot_mask(),
546 tick_broadcast_oneshot_mask, mask); 547 tick_get_broadcast_oneshot_mask(),
547 548 to_cpumask(tmpmask));
548 if (was_periodic && !cpus_empty(mask)) { 549
549 tick_broadcast_init_next_event(&mask, tick_next_period); 550 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
551 tick_broadcast_init_next_event(to_cpumask(tmpmask),
552 tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1); 553 tick_broadcast_set_event(tick_next_period, 1);
551 } else 554 } else
552 bc->next_event.tv64 = KTIME_MAX; 555 bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
585 * Clear the broadcast mask flag for the dead cpu, but do not 588 * Clear the broadcast mask flag for the dead cpu, but do not
586 * stop the broadcast device! 589 * stop the broadcast device!
587 */ 590 */
588 cpu_clear(cpu, tick_broadcast_oneshot_mask); 591 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
589 592
590 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 593 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
591} 594}
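The tick-broadcast hunks above all follow one pattern from the cpumask rework: on-stack cpumask_t variables and the old cpu_isset()/cpu_set()/cpus_or() helpers give way to struct cpumask operations on a file-static bitmap reached through to_cpumask(), so the stack footprint no longer grows with NR_CPUS. A minimal sketch of that before/after shape, assuming a stand-alone tmpmask bitmap rather than the exact symbols in tick-broadcast.c:

#include <linux/cpumask.h>

/* old style: a full cpumask lives on the kernel stack */
static void mark_online_old(void)
{
        cpumask_t mask = CPU_MASK_NONE;
        int cpu;

        for_each_online_cpu(cpu)
                cpu_set(cpu, mask);             /* deprecated accessor */
}

/* new style: a static bitmap, manipulated through struct cpumask * */
static DECLARE_BITMAP(tmpmask, NR_CPUS);

static void mark_online_new(void)
{
        int cpu;

        cpumask_clear(to_cpumask(tmpmask));
        for_each_online_cpu(cpu)
                cpumask_set_cpu(cpu, to_cpumask(tmpmask));
}

The same substitution accounts for every cpu_isset() -> cpumask_test_cpu() and cpu_clear() -> cpumask_clear_cpu() pair in the hunks above.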
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f8372be74122..63e05d423a09 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); 257 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
299 } 299 }
300 /* Transfer the do_timer job away from this cpu */ 300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = cpumask_first(cpu_online_mask);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : 304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE; 305 TICK_DO_TIMER_NONE;
306 } 306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
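tick-common.c also changes the "no cpu found" convention: first_cpu() used NR_CPUS as its sentinel, while cpumask_first() returns a value greater than or equal to nr_cpu_ids when the mask is empty, which is what the new comparison checks. A small sketch of the new idiom, with an assumed function name:

#include <linux/cpumask.h>

/* pick a fallback timekeeping cpu, or -1 if the online mask is somehow empty */
static int pick_do_timer_cpu(void)
{
        int cpu = cpumask_first(cpu_online_mask);

        return (cpu < nr_cpu_ids) ? cpu : -1;
}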
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 70f872c71f4e..76a574bbef97 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpumask_set_cpu(cpu, nohz_cpu_mask); 304 cpumask_set_cpu(cpu, nohz_cpu_mask);
305
306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -681,7 +688,6 @@ void tick_setup_sched_timer(void)
681 */ 688 */
682 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
683 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
684 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
685 691
686 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
687 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
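In tick-sched.c the expiry computation and the tick_do_timer_cpu hand-off move ahead of the later checks, and a new early exit skips reprogramming the clock event device when the computed expiry matches what is already programmed. The two building blocks, sketched as stand-alone helpers (names are assumptions, not functions in the file):

#include <linux/ktime.h>

/* absolute expiry for "idle for delta_jiffies ticks starting at last_update" */
static ktime_t next_wheel_expiry(ktime_t last_update, ktime_t tick_period,
                                 unsigned long delta_jiffies)
{
        return ktime_add_ns(last_update, tick_period.tv64 * delta_jiffies);
}

/* reprogramming is only needed if the device is not already set to expire then */
static int needs_reprogram(int tick_stopped, ktime_t expires, ktime_t next_event)
{
        return !(tick_stopped && ktime_equal(expires, next_event));
}

Dropping the tick_do_timer_cpu assignment before any of the early exits is what keeps jiffies from going stale while this cpu sleeps, as the moved comment explains.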
diff --git a/kernel/timer.c b/kernel/timer.c
index dbd50fabe4c7..566257d1dc10 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1192,25 +1192,25 @@ asmlinkage long sys_getppid(void)
1192asmlinkage long sys_getuid(void) 1192asmlinkage long sys_getuid(void)
1193{ 1193{
1194 /* Only we change this so SMP safe */ 1194 /* Only we change this so SMP safe */
1195 return current->uid; 1195 return current_uid();
1196} 1196}
1197 1197
1198asmlinkage long sys_geteuid(void) 1198asmlinkage long sys_geteuid(void)
1199{ 1199{
1200 /* Only we change this so SMP safe */ 1200 /* Only we change this so SMP safe */
1201 return current->euid; 1201 return current_euid();
1202} 1202}
1203 1203
1204asmlinkage long sys_getgid(void) 1204asmlinkage long sys_getgid(void)
1205{ 1205{
1206 /* Only we change this so SMP safe */ 1206 /* Only we change this so SMP safe */
1207 return current->gid; 1207 return current_gid();
1208} 1208}
1209 1209
1210asmlinkage long sys_getegid(void) 1210asmlinkage long sys_getegid(void)
1211{ 1211{
1212 /* Only we change this so SMP safe */ 1212 /* Only we change this so SMP safe */
1213 return current->egid; 1213 return current_egid();
1214} 1214}
1215 1215
1216#endif 1216#endif
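The timer.c hunk belongs to the credentials rework carried by this merge: uid/gid fields are no longer read straight out of task_struct but through the current_uid()/current_euid()/current_gid()/current_egid() accessors from <linux/cred.h>. A hedged example of the new idiom (the helper name is illustrative, not from the tree):

#include <linux/cred.h>

/* true if the caller's real and effective uid agree, read the new way */
static int caller_uid_matches_euid(void)
{
        return current_uid() == current_euid();
}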
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index bde6f03512d5..e2a4ff6fc3a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -244,16 +244,21 @@ config STACK_TRACER
244 244
245 This tracer works by hooking into every function call that the 245 This tracer works by hooking into every function call that the
246 kernel executes, and keeping a maximum stack depth value and 246 kernel executes, and keeping a maximum stack depth value and
247 stack-trace saved. Because this logic has to execute in every 247 stack-trace saved. If this is configured with DYNAMIC_FTRACE
248 kernel function, all the time, this option can slow down the 248 then it will not have any overhead while the stack tracer
249 kernel measurably and is generally intended for kernel 249 is disabled.
250 developers only. 250
251 To enable the stack tracer on bootup, pass in 'stacktrace'
252 on the kernel command line.
253
254 The stack tracer can also be enabled or disabled via the
255 sysctl kernel.stack_tracer_enabled
251 256
252 Say N if unsure. 257 Say N if unsure.
253 258
254config BTS_TRACER 259config HW_BRANCH_TRACER
255 depends on HAVE_HW_BRANCH_TRACER 260 depends on HAVE_HW_BRANCH_TRACER
256 bool "Trace branches" 261 bool "Trace hw branches"
257 select TRACING 262 select TRACING
258 help 263 help
259 This tracer records all branches on the system in a circular 264 This tracer records all branches on the system in a circular
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 62dc561b6676..349d5a93653f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_BTS_TRACER) += trace_bts.o 34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o 35obj-$(CONFIG_POWER_TRACER) += trace_power.o
36 36
37libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12f80efceaa..2f32969c09df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
1047 int type = MATCH_FULL; 1047 int type = MATCH_FULL;
1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1049 unsigned i, match = 0, search_len = 0; 1049 unsigned i, match = 0, search_len = 0;
1050 int not = 0;
1051
1052 if (buff[0] == '!') {
1053 not = 1;
1054 buff++;
1055 len--;
1056 }
1050 1057
1051 for (i = 0; i < len; i++) { 1058 for (i = 0; i < len; i++) {
1052 if (buff[i] == '*') { 1059 if (buff[i] == '*') {
@@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable)
1100 matched = 1; 1107 matched = 1;
1101 break; 1108 break;
1102 } 1109 }
1103 if (matched) 1110 if (matched) {
1104 rec->flags |= flag; 1111 if (not)
1112 rec->flags &= ~flag;
1113 else
1114 rec->flags |= flag;
1115 }
1105 } 1116 }
1106 pg = pg->next; 1117 pg = pg->next;
1107 } 1118 }
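The ftrace_match() change introduces a leading '!' in the filter syntax: when the remaining pattern matches a record, the flag is cleared instead of set, so writing "!function" to set_ftrace_filter removes an entry rather than adding one. A self-contained sketch of the negation handling, using a stand-in record type instead of struct dyn_ftrace:

struct rec { unsigned long flags; };            /* stand-in for struct dyn_ftrace */
#define FL_FILTER       0x1UL

/* apply "pattern" vs "!pattern" semantics once a match has been decided */
static void apply_filter(struct rec *rec, const char *buff, int matched)
{
        int negate = 0;

        if (buff[0] == '!')
                negate = 1;                     /* negated pattern: clear on match */

        if (!matched)
                return;

        if (negate)
                rec->flags &= ~FL_FILTER;
        else
                rec->flags |= FL_FILTER;
}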
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f69cfeaadf7..a9d9760dc7b6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -69,6 +69,7 @@ void tracing_on(void)
69{ 69{
70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
71} 71}
72EXPORT_SYMBOL_GPL(tracing_on);
72 73
73/** 74/**
74 * tracing_off - turn off all tracing buffers 75 * tracing_off - turn off all tracing buffers
@@ -82,6 +83,7 @@ void tracing_off(void)
82{ 83{
83 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); 84 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
84} 85}
86EXPORT_SYMBOL_GPL(tracing_off);
85 87
86/** 88/**
87 * tracing_off_permanent - permanently disable ring buffers 89 * tracing_off_permanent - permanently disable ring buffers
@@ -107,16 +109,18 @@ u64 ring_buffer_time_stamp(int cpu)
107 preempt_disable_notrace(); 109 preempt_disable_notrace();
108 /* shift to debug/test normalization and TIME_EXTENTS */ 110 /* shift to debug/test normalization and TIME_EXTENTS */
109 time = sched_clock() << DEBUG_SHIFT; 111 time = sched_clock() << DEBUG_SHIFT;
110 preempt_enable_notrace(); 112 preempt_enable_no_resched_notrace();
111 113
112 return time; 114 return time;
113} 115}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
114 117
115void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
116{ 119{
117 /* Just stupid testing the normalize function and deltas */ 120 /* Just stupid testing the normalize function and deltas */
118 *ts >>= DEBUG_SHIFT; 121 *ts >>= DEBUG_SHIFT;
119} 122}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
120 124
121#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
122#define RB_ALIGNMENT_SHIFT 2 126#define RB_ALIGNMENT_SHIFT 2
@@ -166,6 +170,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
166{ 170{
167 return rb_event_length(event); 171 return rb_event_length(event);
168} 172}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length);
169 174
170/* inline for ring buffer fast paths */ 175/* inline for ring buffer fast paths */
171static inline void * 176static inline void *
@@ -187,9 +192,10 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
187{ 192{
188 return rb_event_data(event); 193 return rb_event_data(event);
189} 194}
195EXPORT_SYMBOL_GPL(ring_buffer_event_data);
190 196
191#define for_each_buffer_cpu(buffer, cpu) \ 197#define for_each_buffer_cpu(buffer, cpu) \
192 for_each_cpu_mask(cpu, buffer->cpumask) 198 for_each_cpu(cpu, buffer->cpumask)
193 199
194#define TS_SHIFT 27 200#define TS_SHIFT 27
195#define TS_MASK ((1ULL << TS_SHIFT) - 1) 201#define TS_MASK ((1ULL << TS_SHIFT) - 1)
@@ -258,11 +264,10 @@ struct ring_buffer_per_cpu {
258}; 264};
259 265
260struct ring_buffer { 266struct ring_buffer {
261 unsigned long size;
262 unsigned pages; 267 unsigned pages;
263 unsigned flags; 268 unsigned flags;
264 int cpus; 269 int cpus;
265 cpumask_t cpumask; 270 cpumask_var_t cpumask;
266 atomic_t record_disabled; 271 atomic_t record_disabled;
267 272
268 struct mutex mutex; 273 struct mutex mutex;
@@ -428,7 +433,7 @@ extern int ring_buffer_page_too_big(void);
428 433
429/** 434/**
430 * ring_buffer_alloc - allocate a new ring_buffer 435 * ring_buffer_alloc - allocate a new ring_buffer
431 * @size: the size in bytes that is needed. 436 * @size: the size in bytes per cpu that is needed.
432 * @flags: attributes to set for the ring buffer. 437 * @flags: attributes to set for the ring buffer.
433 * 438 *
434 * Currently the only flag that is available is the RB_FL_OVERWRITE 439 * Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -453,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
453 if (!buffer) 458 if (!buffer)
454 return NULL; 459 return NULL;
455 460
461 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
462 goto fail_free_buffer;
463
456 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 464 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
457 buffer->flags = flags; 465 buffer->flags = flags;
458 466
@@ -460,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
460 if (buffer->pages == 1) 468 if (buffer->pages == 1)
461 buffer->pages++; 469 buffer->pages++;
462 470
463 buffer->cpumask = cpu_possible_map; 471 cpumask_copy(buffer->cpumask, cpu_possible_mask);
464 buffer->cpus = nr_cpu_ids; 472 buffer->cpus = nr_cpu_ids;
465 473
466 bsize = sizeof(void *) * nr_cpu_ids; 474 bsize = sizeof(void *) * nr_cpu_ids;
467 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 475 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
468 GFP_KERNEL); 476 GFP_KERNEL);
469 if (!buffer->buffers) 477 if (!buffer->buffers)
470 goto fail_free_buffer; 478 goto fail_free_cpumask;
471 479
472 for_each_buffer_cpu(buffer, cpu) { 480 for_each_buffer_cpu(buffer, cpu) {
473 buffer->buffers[cpu] = 481 buffer->buffers[cpu] =
@@ -487,10 +495,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
487 } 495 }
488 kfree(buffer->buffers); 496 kfree(buffer->buffers);
489 497
498 fail_free_cpumask:
499 free_cpumask_var(buffer->cpumask);
500
490 fail_free_buffer: 501 fail_free_buffer:
491 kfree(buffer); 502 kfree(buffer);
492 return NULL; 503 return NULL;
493} 504}
505EXPORT_SYMBOL_GPL(ring_buffer_alloc);
494 506
495/** 507/**
496 * ring_buffer_free - free a ring buffer. 508 * ring_buffer_free - free a ring buffer.
@@ -504,8 +516,11 @@ ring_buffer_free(struct ring_buffer *buffer)
504 for_each_buffer_cpu(buffer, cpu) 516 for_each_buffer_cpu(buffer, cpu)
505 rb_free_cpu_buffer(buffer->buffers[cpu]); 517 rb_free_cpu_buffer(buffer->buffers[cpu]);
506 518
519 free_cpumask_var(buffer->cpumask);
520
507 kfree(buffer); 521 kfree(buffer);
508} 522}
523EXPORT_SYMBOL_GPL(ring_buffer_free);
509 524
510static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 525static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
511 526
@@ -681,6 +696,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
681 mutex_unlock(&buffer->mutex); 696 mutex_unlock(&buffer->mutex);
682 return -ENOMEM; 697 return -ENOMEM;
683} 698}
699EXPORT_SYMBOL_GPL(ring_buffer_resize);
684 700
685static inline int rb_null_event(struct ring_buffer_event *event) 701static inline int rb_null_event(struct ring_buffer_event *event)
686{ 702{
@@ -839,6 +855,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
839 * back to us). This allows us to do a simple loop to 855 * back to us). This allows us to do a simple loop to
840 * assign the commit to the tail. 856 * assign the commit to the tail.
841 */ 857 */
858 again:
842 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 859 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
843 cpu_buffer->commit_page->page->commit = 860 cpu_buffer->commit_page->page->commit =
844 cpu_buffer->commit_page->write; 861 cpu_buffer->commit_page->write;
@@ -854,6 +871,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
854 cpu_buffer->commit_page->write; 871 cpu_buffer->commit_page->write;
855 barrier(); 872 barrier();
856 } 873 }
874
875 /* again, keep gcc from optimizing */
876 barrier();
877
878 /*
879 * If an interrupt came in just after the first while loop
880 * and pushed the tail page forward, we will be left with
881 * a dangling commit that will never go forward.
882 */
883 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
884 goto again;
857} 885}
858 886
859static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 887static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
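The rb_set_commit_to_write() hunk closes the race its new comment describes: an interrupt landing after the commit loops can push the tail page forward and leave a dangling commit that would never advance. The fix re-checks the pages behind a compiler barrier and retries. A self-contained sketch of that pattern, using a toy structure instead of struct ring_buffer_per_cpu:

#include <linux/compiler.h>

struct pages {                          /* stand-in for the per-cpu buffer */
        void *commit_page;
        void *tail_page;
};

static void advance_commit(struct pages *p)
{
        p->commit_page = p->tail_page;  /* placeholder for the real page walk */
}

static void settle_commit(struct pages *p)
{
 again:
        while (p->commit_page != p->tail_page)
                advance_commit(p);

        barrier();                      /* keep gcc from caching the pointers */

        /* an interrupt may have moved tail_page after the loop finished */
        if (unlikely(p->commit_page != p->tail_page))
                goto again;
}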
@@ -951,12 +979,15 @@ static struct ring_buffer_event *
951__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 979__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
952 unsigned type, unsigned long length, u64 *ts) 980 unsigned type, unsigned long length, u64 *ts)
953{ 981{
954 struct buffer_page *tail_page, *head_page, *reader_page; 982 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
955 unsigned long tail, write; 983 unsigned long tail, write;
956 struct ring_buffer *buffer = cpu_buffer->buffer; 984 struct ring_buffer *buffer = cpu_buffer->buffer;
957 struct ring_buffer_event *event; 985 struct ring_buffer_event *event;
958 unsigned long flags; 986 unsigned long flags;
959 987
988 commit_page = cpu_buffer->commit_page;
989 /* we just need to protect against interrupts */
990 barrier();
960 tail_page = cpu_buffer->tail_page; 991 tail_page = cpu_buffer->tail_page;
961 write = local_add_return(length, &tail_page->write); 992 write = local_add_return(length, &tail_page->write);
962 tail = write - length; 993 tail = write - length;
@@ -982,7 +1013,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
982 * it all the way around the buffer, bail, and warn 1013 * it all the way around the buffer, bail, and warn
983 * about it. 1014 * about it.
984 */ 1015 */
985 if (unlikely(next_page == cpu_buffer->commit_page)) { 1016 if (unlikely(next_page == commit_page)) {
986 WARN_ON_ONCE(1); 1017 WARN_ON_ONCE(1);
987 goto out_unlock; 1018 goto out_unlock;
988 } 1019 }
@@ -1260,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1260 1291
1261 cpu = raw_smp_processor_id(); 1292 cpu = raw_smp_processor_id();
1262 1293
1263 if (!cpu_isset(cpu, buffer->cpumask)) 1294 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1264 goto out; 1295 goto out;
1265 1296
1266 cpu_buffer = buffer->buffers[cpu]; 1297 cpu_buffer = buffer->buffers[cpu];
@@ -1290,6 +1321,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1290 ftrace_preempt_enable(resched); 1321 ftrace_preempt_enable(resched);
1291 return NULL; 1322 return NULL;
1292} 1323}
1324EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1293 1325
1294static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1326static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1295 struct ring_buffer_event *event) 1327 struct ring_buffer_event *event)
@@ -1336,6 +1368,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1336 1368
1337 return 0; 1369 return 0;
1338} 1370}
1371EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
1339 1372
1340/** 1373/**
1341 * ring_buffer_write - write data to the buffer without reserving 1374 * ring_buffer_write - write data to the buffer without reserving
@@ -1371,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1371 1404
1372 cpu = raw_smp_processor_id(); 1405 cpu = raw_smp_processor_id();
1373 1406
1374 if (!cpu_isset(cpu, buffer->cpumask)) 1407 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1375 goto out; 1408 goto out;
1376 1409
1377 cpu_buffer = buffer->buffers[cpu]; 1410 cpu_buffer = buffer->buffers[cpu];
@@ -1397,6 +1430,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1397 1430
1398 return ret; 1431 return ret;
1399} 1432}
1433EXPORT_SYMBOL_GPL(ring_buffer_write);
1400 1434
1401static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1435static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1402{ 1436{
@@ -1423,6 +1457,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer)
1423{ 1457{
1424 atomic_inc(&buffer->record_disabled); 1458 atomic_inc(&buffer->record_disabled);
1425} 1459}
1460EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
1426 1461
1427/** 1462/**
1428 * ring_buffer_record_enable - enable writes to the buffer 1463 * ring_buffer_record_enable - enable writes to the buffer
@@ -1435,6 +1470,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
1435{ 1470{
1436 atomic_dec(&buffer->record_disabled); 1471 atomic_dec(&buffer->record_disabled);
1437} 1472}
1473EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
1438 1474
1439/** 1475/**
1440 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 1476 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
@@ -1450,12 +1486,13 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1450{ 1486{
1451 struct ring_buffer_per_cpu *cpu_buffer; 1487 struct ring_buffer_per_cpu *cpu_buffer;
1452 1488
1453 if (!cpu_isset(cpu, buffer->cpumask)) 1489 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1454 return; 1490 return;
1455 1491
1456 cpu_buffer = buffer->buffers[cpu]; 1492 cpu_buffer = buffer->buffers[cpu];
1457 atomic_inc(&cpu_buffer->record_disabled); 1493 atomic_inc(&cpu_buffer->record_disabled);
1458} 1494}
1495EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
1459 1496
1460/** 1497/**
1461 * ring_buffer_record_enable_cpu - enable writes to the buffer 1498 * ring_buffer_record_enable_cpu - enable writes to the buffer
@@ -1469,12 +1506,13 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1469{ 1506{
1470 struct ring_buffer_per_cpu *cpu_buffer; 1507 struct ring_buffer_per_cpu *cpu_buffer;
1471 1508
1472 if (!cpu_isset(cpu, buffer->cpumask)) 1509 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1473 return; 1510 return;
1474 1511
1475 cpu_buffer = buffer->buffers[cpu]; 1512 cpu_buffer = buffer->buffers[cpu];
1476 atomic_dec(&cpu_buffer->record_disabled); 1513 atomic_dec(&cpu_buffer->record_disabled);
1477} 1514}
1515EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1478 1516
1479/** 1517/**
1480 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 1518 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
@@ -1485,12 +1523,13 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1485{ 1523{
1486 struct ring_buffer_per_cpu *cpu_buffer; 1524 struct ring_buffer_per_cpu *cpu_buffer;
1487 1525
1488 if (!cpu_isset(cpu, buffer->cpumask)) 1526 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1489 return 0; 1527 return 0;
1490 1528
1491 cpu_buffer = buffer->buffers[cpu]; 1529 cpu_buffer = buffer->buffers[cpu];
1492 return cpu_buffer->entries; 1530 return cpu_buffer->entries;
1493} 1531}
1532EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1494 1533
1495/** 1534/**
1496 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 1535 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
@@ -1501,12 +1540,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1501{ 1540{
1502 struct ring_buffer_per_cpu *cpu_buffer; 1541 struct ring_buffer_per_cpu *cpu_buffer;
1503 1542
1504 if (!cpu_isset(cpu, buffer->cpumask)) 1543 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1505 return 0; 1544 return 0;
1506 1545
1507 cpu_buffer = buffer->buffers[cpu]; 1546 cpu_buffer = buffer->buffers[cpu];
1508 return cpu_buffer->overrun; 1547 return cpu_buffer->overrun;
1509} 1548}
1549EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1510 1550
1511/** 1551/**
1512 * ring_buffer_entries - get the number of entries in a buffer 1552 * ring_buffer_entries - get the number of entries in a buffer
@@ -1529,6 +1569,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1529 1569
1530 return entries; 1570 return entries;
1531} 1571}
1572EXPORT_SYMBOL_GPL(ring_buffer_entries);
1532 1573
1533/** 1574/**
1534 * ring_buffer_overrun_cpu - get the number of overruns in buffer 1575 * ring_buffer_overrun_cpu - get the number of overruns in buffer
@@ -1551,6 +1592,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1551 1592
1552 return overruns; 1593 return overruns;
1553} 1594}
1595EXPORT_SYMBOL_GPL(ring_buffer_overruns);
1554 1596
1555static void rb_iter_reset(struct ring_buffer_iter *iter) 1597static void rb_iter_reset(struct ring_buffer_iter *iter)
1556{ 1598{
@@ -1586,6 +1628,7 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1586 rb_iter_reset(iter); 1628 rb_iter_reset(iter);
1587 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 1629 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1588} 1630}
1631EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
1589 1632
1590/** 1633/**
1591 * ring_buffer_iter_empty - check if an iterator has no more to read 1634 * ring_buffer_iter_empty - check if an iterator has no more to read
@@ -1600,6 +1643,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1600 return iter->head_page == cpu_buffer->commit_page && 1643 return iter->head_page == cpu_buffer->commit_page &&
1601 iter->head == rb_commit_index(cpu_buffer); 1644 iter->head == rb_commit_index(cpu_buffer);
1602} 1645}
1646EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
1603 1647
1604static void 1648static void
1605rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, 1649rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1814,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1814 struct buffer_page *reader; 1858 struct buffer_page *reader;
1815 int nr_loops = 0; 1859 int nr_loops = 0;
1816 1860
1817 if (!cpu_isset(cpu, buffer->cpumask)) 1861 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1818 return NULL; 1862 return NULL;
1819 1863
1820 cpu_buffer = buffer->buffers[cpu]; 1864 cpu_buffer = buffer->buffers[cpu];
@@ -1866,6 +1910,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1866 1910
1867 return NULL; 1911 return NULL;
1868} 1912}
1913EXPORT_SYMBOL_GPL(ring_buffer_peek);
1869 1914
1870static struct ring_buffer_event * 1915static struct ring_buffer_event *
1871rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) 1916rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
@@ -1926,6 +1971,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1926 1971
1927 return NULL; 1972 return NULL;
1928} 1973}
1974EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1929 1975
1930/** 1976/**
1931 * ring_buffer_peek - peek at the next event to be read 1977 * ring_buffer_peek - peek at the next event to be read
@@ -1987,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1987 struct ring_buffer_event *event; 2033 struct ring_buffer_event *event;
1988 unsigned long flags; 2034 unsigned long flags;
1989 2035
1990 if (!cpu_isset(cpu, buffer->cpumask)) 2036 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1991 return NULL; 2037 return NULL;
1992 2038
1993 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2039 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2003,6 +2049,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2003 2049
2004 return event; 2050 return event;
2005} 2051}
2052EXPORT_SYMBOL_GPL(ring_buffer_consume);
2006 2053
2007/** 2054/**
2008 * ring_buffer_read_start - start a non consuming read of the buffer 2055 * ring_buffer_read_start - start a non consuming read of the buffer
@@ -2023,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2023 struct ring_buffer_iter *iter; 2070 struct ring_buffer_iter *iter;
2024 unsigned long flags; 2071 unsigned long flags;
2025 2072
2026 if (!cpu_isset(cpu, buffer->cpumask)) 2073 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2027 return NULL; 2074 return NULL;
2028 2075
2029 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 2076 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -2045,6 +2092,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2045 2092
2046 return iter; 2093 return iter;
2047} 2094}
2095EXPORT_SYMBOL_GPL(ring_buffer_read_start);
2048 2096
2049/** 2097/**
2050 * ring_buffer_finish - finish reading the iterator of the buffer 2098 * ring_buffer_finish - finish reading the iterator of the buffer
@@ -2061,6 +2109,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
2061 atomic_dec(&cpu_buffer->record_disabled); 2109 atomic_dec(&cpu_buffer->record_disabled);
2062 kfree(iter); 2110 kfree(iter);
2063} 2111}
2112EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
2064 2113
2065/** 2114/**
2066 * ring_buffer_read - read the next item in the ring buffer by the iterator 2115 * ring_buffer_read - read the next item in the ring buffer by the iterator
@@ -2087,6 +2136,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2087 2136
2088 return event; 2137 return event;
2089} 2138}
2139EXPORT_SYMBOL_GPL(ring_buffer_read);
2090 2140
2091/** 2141/**
2092 * ring_buffer_size - return the size of the ring buffer (in bytes) 2142 * ring_buffer_size - return the size of the ring buffer (in bytes)
@@ -2096,6 +2146,7 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer)
2096{ 2146{
2097 return BUF_PAGE_SIZE * buffer->pages; 2147 return BUF_PAGE_SIZE * buffer->pages;
2098} 2148}
2149EXPORT_SYMBOL_GPL(ring_buffer_size);
2099 2150
2100static void 2151static void
2101rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 2152rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
@@ -2129,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2129 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2180 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2130 unsigned long flags; 2181 unsigned long flags;
2131 2182
2132 if (!cpu_isset(cpu, buffer->cpumask)) 2183 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2133 return; 2184 return;
2134 2185
2135 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2186 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2142,6 +2193,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2142 2193
2143 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2194 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2144} 2195}
2196EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2145 2197
2146/** 2198/**
2147 * ring_buffer_reset - reset a ring buffer 2199 * ring_buffer_reset - reset a ring buffer
@@ -2154,6 +2206,7 @@ void ring_buffer_reset(struct ring_buffer *buffer)
2154 for_each_buffer_cpu(buffer, cpu) 2206 for_each_buffer_cpu(buffer, cpu)
2155 ring_buffer_reset_cpu(buffer, cpu); 2207 ring_buffer_reset_cpu(buffer, cpu);
2156} 2208}
2209EXPORT_SYMBOL_GPL(ring_buffer_reset);
2157 2210
2158/** 2211/**
2159 * rind_buffer_empty - is the ring buffer empty? 2212 * rind_buffer_empty - is the ring buffer empty?
@@ -2172,6 +2225,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2172 } 2225 }
2173 return 1; 2226 return 1;
2174} 2227}
2228EXPORT_SYMBOL_GPL(ring_buffer_empty);
2175 2229
2176/** 2230/**
2177 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 2231 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
@@ -2182,12 +2236,13 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2182{ 2236{
2183 struct ring_buffer_per_cpu *cpu_buffer; 2237 struct ring_buffer_per_cpu *cpu_buffer;
2184 2238
2185 if (!cpu_isset(cpu, buffer->cpumask)) 2239 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2186 return 1; 2240 return 1;
2187 2241
2188 cpu_buffer = buffer->buffers[cpu]; 2242 cpu_buffer = buffer->buffers[cpu];
2189 return rb_per_cpu_empty(cpu_buffer); 2243 return rb_per_cpu_empty(cpu_buffer);
2190} 2244}
2245EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2191 2246
2192/** 2247/**
2193 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 2248 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -2205,13 +2260,12 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2205 struct ring_buffer_per_cpu *cpu_buffer_a; 2260 struct ring_buffer_per_cpu *cpu_buffer_a;
2206 struct ring_buffer_per_cpu *cpu_buffer_b; 2261 struct ring_buffer_per_cpu *cpu_buffer_b;
2207 2262
2208 if (!cpu_isset(cpu, buffer_a->cpumask) || 2263 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2209 !cpu_isset(cpu, buffer_b->cpumask)) 2264 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2210 return -EINVAL; 2265 return -EINVAL;
2211 2266
2212 /* At least make sure the two buffers are somewhat the same */ 2267 /* At least make sure the two buffers are somewhat the same */
2213 if (buffer_a->size != buffer_b->size || 2268 if (buffer_a->pages != buffer_b->pages)
2214 buffer_a->pages != buffer_b->pages)
2215 return -EINVAL; 2269 return -EINVAL;
2216 2270
2217 cpu_buffer_a = buffer_a->buffers[cpu]; 2271 cpu_buffer_a = buffer_a->buffers[cpu];
@@ -2237,6 +2291,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2237 2291
2238 return 0; 2292 return 0;
2239} 2293}
2294EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2240 2295
2241static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, 2296static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2242 struct buffer_data_page *bpage) 2297 struct buffer_data_page *bpage)
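Beyond the open-coded cpumask helpers, ring_buffer.c converts the cpumask_t embedded in struct ring_buffer to a cpumask_var_t, which becomes a pointer when CONFIG_CPUMASK_OFFSTACK=y and therefore has to be allocated and freed explicitly; the allocation error path gains a matching fail_free_cpumask label. A hedged sketch of the allocate/copy/free life cycle, with a stand-in structure:

#include <linux/cpumask.h>
#include <linux/slab.h>

struct demo_buffer {                    /* stand-in for struct ring_buffer */
        cpumask_var_t cpumask;
};

static struct demo_buffer *demo_alloc(void)
{
        struct demo_buffer *b = kzalloc(sizeof(*b), GFP_KERNEL);

        if (!b)
                return NULL;

        if (!alloc_cpumask_var(&b->cpumask, GFP_KERNEL))
                goto fail_free_buffer;

        cpumask_copy(b->cpumask, cpu_possible_mask);
        return b;

 fail_free_buffer:
        kfree(b);
        return NULL;
}

static void demo_free(struct demo_buffer *b)
{
        free_cpumask_var(b->cpumask);
        kfree(b);
}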
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6adf660fc816..c580233add95 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,7 +30,6 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
34#include <linux/writeback.h> 33#include <linux/writeback.h>
35 34
36#include <linux/stacktrace.h> 35#include <linux/stacktrace.h>
@@ -90,10 +89,10 @@ static inline void ftrace_enable_cpu(void)
90 preempt_enable(); 89 preempt_enable();
91} 90}
92 91
93static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_var_t __read_mostly tracing_buffer_mask;
94 93
95#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
96 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu(cpu, tracing_buffer_mask)
97 96
98/* 97/*
99 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -287,6 +286,7 @@ static const char *trace_options[] = {
287 "annotate", 286 "annotate",
288 "userstacktrace", 287 "userstacktrace",
289 "sym-userobj", 288 "sym-userobj",
289 "printk-msg-only",
290 NULL 290 NULL
291}; 291};
292 292
@@ -320,7 +320,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
320 320
321 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 321 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
322 data->pid = tsk->pid; 322 data->pid = tsk->pid;
323 data->uid = tsk->uid; 323 data->uid = task_uid(tsk);
324 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 324 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
325 data->policy = tsk->policy; 325 data->policy = tsk->policy;
326 data->rt_priority = tsk->rt_priority; 326 data->rt_priority = tsk->rt_priority;
@@ -678,6 +678,16 @@ void tracing_reset(struct trace_array *tr, int cpu)
678 ftrace_enable_cpu(); 678 ftrace_enable_cpu();
679} 679}
680 680
681void tracing_reset_online_cpus(struct trace_array *tr)
682{
683 int cpu;
684
685 tr->time_start = ftrace_now(tr->cpu);
686
687 for_each_online_cpu(cpu)
688 tracing_reset(tr, cpu);
689}
690
681#define SAVED_CMDLINES 128 691#define SAVED_CMDLINES 128
682static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 692static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
683static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 693static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@ -1299,7 +1309,7 @@ enum trace_file_type {
1299 TRACE_FILE_ANNOTATE = 2, 1309 TRACE_FILE_ANNOTATE = 2,
1300}; 1310};
1301 1311
1302static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1312static void trace_iterator_increment(struct trace_iterator *iter)
1303{ 1313{
1304 /* Don't allow ftrace to trace into the ring buffers */ 1314 /* Don't allow ftrace to trace into the ring buffers */
1305 ftrace_disable_cpu(); 1315 ftrace_disable_cpu();
@@ -1378,7 +1388,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
1378 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1388 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
1379 1389
1380 if (iter->ent) 1390 if (iter->ent)
1381 trace_iterator_increment(iter, iter->cpu); 1391 trace_iterator_increment(iter);
1382 1392
1383 return iter->ent ? iter : NULL; 1393 return iter->ent ? iter : NULL;
1384} 1394}
@@ -1747,6 +1757,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1747 1757
1748static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1749 1759
1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
1750/* 1767/*
1751 * The message is supposed to contain an ending newline. 1768 * The message is supposed to contain an ending newline.
1752 * If the printing stops prematurely, try to add a newline of our own. 1769 * If the printing stops prematurely, try to add a newline of our own.
@@ -1794,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1794 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1795 return; 1812 return;
1796 1813
1797 if (cpu_isset(iter->cpu, iter->started)) 1814 if (cpumask_test_cpu(iter->cpu, iter->started))
1798 return; 1815 return;
1799 1816
1800 cpu_set(iter->cpu, iter->started); 1817 cpumask_set_cpu(iter->cpu, iter->started);
1801 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1802} 1819}
1803 1820
@@ -1815,7 +1832,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1815 char *comm; 1832 char *comm;
1816 int S, T; 1833 int S, T;
1817 int i; 1834 int i;
1818 unsigned state;
1819 1835
1820 if (entry->type == TRACE_CONT) 1836 if (entry->type == TRACE_CONT)
1821 return TRACE_TYPE_HANDLED; 1837 return TRACE_TYPE_HANDLED;
@@ -1861,12 +1877,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1861 1877
1862 trace_assign_type(field, entry); 1878 trace_assign_type(field, entry);
1863 1879
1864 T = field->next_state < sizeof(state_to_char) ? 1880 T = task_state_char(field->next_state);
1865 state_to_char[field->next_state] : 'X'; 1881 S = task_state_char(field->prev_state);
1866
1867 state = field->prev_state ?
1868 __ffs(field->prev_state) + 1 : 0;
1869 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1870 comm = trace_find_cmdline(field->next_pid); 1882 comm = trace_find_cmdline(field->next_pid);
1871 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1872 field->prev_pid, 1884 field->prev_pid,
@@ -2007,10 +2019,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2007 2019
2008 trace_assign_type(field, entry); 2020 trace_assign_type(field, entry);
2009 2021
2010 S = field->prev_state < sizeof(state_to_char) ? 2022 T = task_state_char(field->next_state);
2011 state_to_char[field->prev_state] : 'X'; 2023 S = task_state_char(field->prev_state);
2012 T = field->next_state < sizeof(state_to_char) ?
2013 state_to_char[field->next_state] : 'X';
2014 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", 2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
2015 field->prev_pid, 2025 field->prev_pid,
2016 field->prev_prio, 2026 field->prev_prio,
@@ -2140,12 +2150,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2140 2150
2141 trace_assign_type(field, entry); 2151 trace_assign_type(field, entry);
2142 2152
2143 S = field->prev_state < sizeof(state_to_char) ? 2153 T = task_state_char(field->next_state);
2144 state_to_char[field->prev_state] : 'X'; 2154 S = entry->type == TRACE_WAKE ? '+' :
2145 T = field->next_state < sizeof(state_to_char) ? 2155 task_state_char(field->prev_state);
2146 state_to_char[field->next_state] : 'X';
2147 if (entry->type == TRACE_WAKE)
2148 S = '+';
2149 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", 2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
2150 field->prev_pid, 2157 field->prev_pid,
2151 field->prev_prio, 2158 field->prev_prio,
@@ -2232,12 +2239,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2232 2239
2233 trace_assign_type(field, entry); 2240 trace_assign_type(field, entry);
2234 2241
2235 S = field->prev_state < sizeof(state_to_char) ? 2242 T = task_state_char(field->next_state);
2236 state_to_char[field->prev_state] : 'X'; 2243 S = entry->type == TRACE_WAKE ? '+' :
2237 T = field->next_state < sizeof(state_to_char) ? 2244 task_state_char(field->prev_state);
2238 state_to_char[field->next_state] : 'X';
2239 if (entry->type == TRACE_WAKE)
2240 S = '+';
2241 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
2242 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
2243 SEQ_PUT_HEX_FIELD_RET(s, S); 2247 SEQ_PUT_HEX_FIELD_RET(s, S);
@@ -2265,6 +2269,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2265 return TRACE_TYPE_HANDLED; 2269 return TRACE_TYPE_HANDLED;
2266} 2270}
2267 2271
2272static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2273{
2274 struct trace_seq *s = &iter->seq;
2275 struct trace_entry *entry = iter->ent;
2276 struct print_entry *field;
2277 int ret;
2278
2279 trace_assign_type(field, entry);
2280
2281 ret = trace_seq_printf(s, field->buf);
2282 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE;
2284
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287
2288 return TRACE_TYPE_HANDLED;
2289}
2290
2268static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2291static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2269{ 2292{
2270 struct trace_seq *s = &iter->seq; 2293 struct trace_seq *s = &iter->seq;
@@ -2345,6 +2368,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2345 return ret; 2368 return ret;
2346 } 2369 }
2347 2370
2371 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2374 return print_printk_msg_only(iter);
2375
2348 if (trace_flags & TRACE_ITER_BIN) 2376 if (trace_flags & TRACE_ITER_BIN)
2349 return print_bin_fmt(iter); 2377 return print_bin_fmt(iter);
2350 2378
@@ -2425,7 +2453,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2425 2453
2426 /* Notify the tracer early; before we stop tracing. */ 2454 /* Notify the tracer early; before we stop tracing. */
2427 if (iter->trace && iter->trace->open) 2455 if (iter->trace && iter->trace->open)
2428 iter->trace->open(iter); 2456 iter->trace->open(iter);
2429 2457
2430 /* Annotate start of buffers if we had overruns */ 2458 /* Annotate start of buffers if we had overruns */
2431 if (ring_buffer_overruns(iter->tr->buffer)) 2459 if (ring_buffer_overruns(iter->tr->buffer))
@@ -2618,13 +2646,7 @@ static struct file_operations show_traces_fops = {
2618/* 2646/*
2619 * Only trace on a CPU if the bitmask is set: 2647 * Only trace on a CPU if the bitmask is set:
2620 */ 2648 */
2621static cpumask_t tracing_cpumask = CPU_MASK_ALL; 2649static cpumask_var_t tracing_cpumask;
2622
2623/*
2624 * When tracing/tracing_cpu_mask is modified then this holds
2625 * the new bitmask we are about to install:
2626 */
2627static cpumask_t tracing_cpumask_new;
2628 2650
2629/* 2651/*
2630 * The tracer itself will not take this lock, but still we want 2652 * The tracer itself will not take this lock, but still we want
@@ -2646,7 +2668,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
2646 2668
2647 mutex_lock(&tracing_cpumask_update_lock); 2669 mutex_lock(&tracing_cpumask_update_lock);
2648 2670
2649 len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); 2671 len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
2650 if (count - len < 2) { 2672 if (count - len < 2) {
2651 count = -EINVAL; 2673 count = -EINVAL;
2652 goto out_err; 2674 goto out_err;
@@ -2665,9 +2687,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2665 size_t count, loff_t *ppos) 2687 size_t count, loff_t *ppos)
2666{ 2688{
2667 int err, cpu; 2689 int err, cpu;
2690 cpumask_var_t tracing_cpumask_new;
2691
2692 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2693 return -ENOMEM;
2668 2694
2669 mutex_lock(&tracing_cpumask_update_lock); 2695 mutex_lock(&tracing_cpumask_update_lock);
2670 err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); 2696 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2671 if (err) 2697 if (err)
2672 goto err_unlock; 2698 goto err_unlock;
2673 2699
@@ -2678,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2678 * Increase/decrease the disabled counter if we are 2704 * Increase/decrease the disabled counter if we are
2679 * about to flip a bit in the cpumask: 2705 * about to flip a bit in the cpumask:
2680 */ 2706 */
2681 if (cpu_isset(cpu, tracing_cpumask) && 2707 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2682 !cpu_isset(cpu, tracing_cpumask_new)) { 2708 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2683 atomic_inc(&global_trace.data[cpu]->disabled); 2709 atomic_inc(&global_trace.data[cpu]->disabled);
2684 } 2710 }
2685 if (!cpu_isset(cpu, tracing_cpumask) && 2711 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2686 cpu_isset(cpu, tracing_cpumask_new)) { 2712 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2687 atomic_dec(&global_trace.data[cpu]->disabled); 2713 atomic_dec(&global_trace.data[cpu]->disabled);
2688 } 2714 }
2689 } 2715 }
2690 __raw_spin_unlock(&ftrace_max_lock); 2716 __raw_spin_unlock(&ftrace_max_lock);
2691 local_irq_enable(); 2717 local_irq_enable();
2692 2718
2693 tracing_cpumask = tracing_cpumask_new; 2719 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
2694 2720
2695 mutex_unlock(&tracing_cpumask_update_lock); 2721 mutex_unlock(&tracing_cpumask_update_lock);
2722 free_cpumask_var(tracing_cpumask_new);
2696 2723
2697 return count; 2724 return count;
2698 2725
2699err_unlock: 2726err_unlock:
2700 mutex_unlock(&tracing_cpumask_update_lock); 2727 mutex_unlock(&tracing_cpumask_update_lock);
2728 free_cpumask_var(tracing_cpumask);
2701 2729
2702 return err; 2730 return err;
2703} 2731}
@@ -3086,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3086 if (!iter) 3114 if (!iter)
3087 return -ENOMEM; 3115 return -ENOMEM;
3088 3116
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3118 kfree(iter);
3119 return -ENOMEM;
3120 }
3121
3089 mutex_lock(&trace_types_lock); 3122 mutex_lock(&trace_types_lock);
3090 3123
3091 /* trace pipe does not show start of buffer */ 3124 /* trace pipe does not show start of buffer */
3092 cpus_setall(iter->started); 3125 cpumask_setall(iter->started);
3093 3126
3094 iter->tr = &global_trace; 3127 iter->tr = &global_trace;
3095 iter->trace = current_trace; 3128 iter->trace = current_trace;
@@ -3106,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3106{ 3139{
3107 struct trace_iterator *iter = file->private_data; 3140 struct trace_iterator *iter = file->private_data;
3108 3141
3142 free_cpumask_var(iter->started);
3109 kfree(iter); 3143 kfree(iter);
3110 atomic_dec(&tracing_reader); 3144 atomic_dec(&tracing_reader);
3111 3145
@@ -3724,7 +3758,6 @@ void ftrace_dump(void)
3724 static DEFINE_SPINLOCK(ftrace_dump_lock); 3758 static DEFINE_SPINLOCK(ftrace_dump_lock);
3725 /* use static because iter can be a bit big for the stack */ 3759 /* use static because iter can be a bit big for the stack */
3726 static struct trace_iterator iter; 3760 static struct trace_iterator iter;
3727 static cpumask_t mask;
3728 static int dump_ran; 3761 static int dump_ran;
3729 unsigned long flags; 3762 unsigned long flags;
3730 int cnt = 0, cpu; 3763 int cnt = 0, cpu;
@@ -3758,8 +3791,6 @@ void ftrace_dump(void)
3758 * and then release the locks again. 3791 * and then release the locks again.
3759 */ 3792 */
3760 3793
3761 cpus_clear(mask);
3762
3763 while (!trace_empty(&iter)) { 3794 while (!trace_empty(&iter)) {
3764 3795
3765 if (!cnt) 3796 if (!cnt)
@@ -3795,19 +3826,28 @@ __init static int tracer_alloc_buffers(void)
3795{ 3826{
3796 struct trace_array_cpu *data; 3827 struct trace_array_cpu *data;
3797 int i; 3828 int i;
3829 int ret = -ENOMEM;
3798 3830
3799 /* TODO: make the number of buffers hot pluggable with CPUS */ 3831 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
3800 tracing_buffer_mask = cpu_possible_map; 3832 goto out;
3833
3834 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3835 goto out_free_buffer_mask;
3801 3836
3837 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3838 cpumask_copy(tracing_cpumask, cpu_all_mask);
3839
3840 /* TODO: make the number of buffers hot pluggable with CPUS */
3802 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 3841 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3803 TRACE_BUFFER_FLAGS); 3842 TRACE_BUFFER_FLAGS);
3804 if (!global_trace.buffer) { 3843 if (!global_trace.buffer) {
3805 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 3844 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3806 WARN_ON(1); 3845 WARN_ON(1);
3807 return 0; 3846 goto out_free_cpumask;
3808 } 3847 }
3809 global_trace.entries = ring_buffer_size(global_trace.buffer); 3848 global_trace.entries = ring_buffer_size(global_trace.buffer);
3810 3849
3850
3811#ifdef CONFIG_TRACER_MAX_TRACE 3851#ifdef CONFIG_TRACER_MAX_TRACE
3812 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 3852 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3813 TRACE_BUFFER_FLAGS); 3853 TRACE_BUFFER_FLAGS);
@@ -3815,7 +3855,7 @@ __init static int tracer_alloc_buffers(void)
3815 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 3855 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3816 WARN_ON(1); 3856 WARN_ON(1);
3817 ring_buffer_free(global_trace.buffer); 3857 ring_buffer_free(global_trace.buffer);
3818 return 0; 3858 goto out_free_cpumask;
3819 } 3859 }
3820 max_tr.entries = ring_buffer_size(max_tr.buffer); 3860 max_tr.entries = ring_buffer_size(max_tr.buffer);
3821 WARN_ON(max_tr.entries != global_trace.entries); 3861 WARN_ON(max_tr.entries != global_trace.entries);
@@ -3845,8 +3885,14 @@ __init static int tracer_alloc_buffers(void)
3845 &trace_panic_notifier); 3885 &trace_panic_notifier);
3846 3886
3847 register_die_notifier(&trace_die_notifier); 3887 register_die_notifier(&trace_die_notifier);
3888 ret = 0;
3848 3889
3849 return 0; 3890out_free_cpumask:
3891 free_cpumask_var(tracing_cpumask);
3892out_free_buffer_mask:
3893 free_cpumask_var(tracing_buffer_mask);
3894out:
3895 return ret;
3850} 3896}
3851early_initcall(tracer_alloc_buffers); 3897early_initcall(tracer_alloc_buffers);
3852fs_initcall(tracer_init_debugfs); 3898fs_initcall(tracer_init_debugfs);
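trace.c applies the same cpumask_var_t treatment to tracing_buffer_mask and tracing_cpumask, and tracing_cpumask_write() now parses user input into a temporary mask allocated per call before copying it into the live mask. A hedged sketch of that allocate-parse-copy-free flow (simplified error handling; not a copy of the file's function, and the names are stand-ins):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/uaccess.h>

/* stand-in for tracing_cpumask; assumed to be alloc_cpumask_var()'d at init */
static cpumask_var_t demo_cpumask;

static ssize_t demo_cpumask_write(const char __user *ubuf, size_t count)
{
        cpumask_var_t new_mask;
        int err;

        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse_user(ubuf, count, new_mask);
        if (err) {
                free_cpumask_var(new_mask);     /* drop the temporary on failure */
                return err;
        }

        cpumask_copy(demo_cpumask, new_mask);   /* install the parsed mask */
        free_cpumask_var(new_mask);
        return count;
}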
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5ac697065a48..4d3d381bfd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -28,7 +28,7 @@ enum trace_type {
28 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 30 TRACE_USER_STACK,
31 TRACE_BTS, 31 TRACE_HW_BRANCHES,
32 TRACE_POWER, 32 TRACE_POWER,
33 33
34 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
@@ -159,10 +159,10 @@ struct trace_branch {
159 char correct; 159 char correct;
160}; 160};
161 161
162struct bts_entry { 162struct hw_branch_entry {
163 struct trace_entry ent; 163 struct trace_entry ent;
164 unsigned long from; 164 u64 from;
165 unsigned long to; 165 u64 to;
166}; 166};
167 167
168struct trace_power { 168struct trace_power {
@@ -278,7 +278,7 @@ extern void __ftrace_bad_type(void);
278 TRACE_GRAPH_ENT); \ 278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \ 280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ 281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ 282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
283 __ftrace_bad_type(); \ 283 __ftrace_bad_type(); \
284 } while (0) 284 } while (0)
@@ -368,12 +368,13 @@ struct trace_iterator {
368 loff_t pos; 368 loff_t pos;
369 long idx; 369 long idx;
370 370
371 cpumask_t started; 371 cpumask_var_t started;
372}; 372};
373 373
374int tracing_is_enabled(void); 374int tracing_is_enabled(void);
375void trace_wake_up(void); 375void trace_wake_up(void);
376void tracing_reset(struct trace_array *tr, int cpu); 376void tracing_reset(struct trace_array *tr, int cpu);
377void tracing_reset_online_cpus(struct trace_array *tr);
377int tracing_open_generic(struct inode *inode, struct file *filp); 378int tracing_open_generic(struct inode *inode, struct file *filp);
378struct dentry *tracing_init_dentry(void); 379struct dentry *tracing_init_dentry(void);
379void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 380void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
@@ -414,9 +415,7 @@ void trace_function(struct trace_array *tr,
414 415
415void trace_graph_return(struct ftrace_graph_ret *trace); 416void trace_graph_return(struct ftrace_graph_ret *trace);
416int trace_graph_entry(struct ftrace_graph_ent *trace); 417int trace_graph_entry(struct ftrace_graph_ent *trace);
417void trace_bts(struct trace_array *tr, 418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
418 unsigned long from,
419 unsigned long to);
420 419
421void tracing_start_cmdline_record(void); 420void tracing_start_cmdline_record(void);
422void tracing_stop_cmdline_record(void); 421void tracing_stop_cmdline_record(void);
@@ -580,7 +579,8 @@ enum trace_iterator_flags {
580 TRACE_ITER_BRANCH = 0x1000, 579 TRACE_ITER_BRANCH = 0x1000,
581 TRACE_ITER_ANNOTATE = 0x2000, 580 TRACE_ITER_ANNOTATE = 0x2000,
582 TRACE_ITER_USERSTACKTRACE = 0x4000, 581 TRACE_ITER_USERSTACKTRACE = 0x4000,
583 TRACE_ITER_SYM_USEROBJ = 0x8000 582 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000
584}; 584};
585 585
586/* 586/*
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a4fa2c57e34e..366c8c333e13 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -37,22 +37,12 @@ void disable_boot_trace(void)
37 tracing_stop_sched_switch_record(); 37 tracing_stop_sched_switch_record();
38} 38}
39 39
40static void reset_boot_trace(struct trace_array *tr)
41{
42 int cpu;
43
44 tr->time_start = ftrace_now(tr->cpu);
45
46 for_each_online_cpu(cpu)
47 tracing_reset(tr, cpu);
48}
49
50static int boot_trace_init(struct trace_array *tr) 40static int boot_trace_init(struct trace_array *tr)
51{ 41{
52 int cpu; 42 int cpu;
53 boot_trace = tr; 43 boot_trace = tr;
54 44
55 for_each_cpu_mask(cpu, cpu_possible_map) 45 for_each_cpu(cpu, cpu_possible_mask)
56 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
57 47
58 tracing_sched_switch_assign_trace(tr); 48 tracing_sched_switch_assign_trace(tr);
@@ -130,7 +120,7 @@ struct tracer boot_tracer __read_mostly =
130{ 120{
131 .name = "initcall", 121 .name = "initcall",
132 .init = boot_trace_init, 122 .init = boot_trace_init,
133 .reset = reset_boot_trace, 123 .reset = tracing_reset_online_cpus,
134 .print_line = initcall_print_line, 124 .print_line = initcall_print_line,
135}; 125};
136 126
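The reset_boot_trace() helper removed here, like the function_reset(), sched_switch_reset(), stack_reset() and mmiotrace reset loops removed further down, is replaced by the shared tracing_reset_online_cpus() declared in trace.h above. Judging by the code being deleted, the shared helper presumably consolidates exactly this pattern:

/* Likely shape of tracing_reset_online_cpus(), inferred from the
 * per-tracer helpers this patch removes; uses the local "trace.h". */
#include "trace.h"

void tracing_reset_online_cpus(struct trace_array *tr)
{
	int cpu;

	tr->time_start = ftrace_now(tr->cpu);

	for_each_online_cpu(cpu)
		tracing_reset(tr, cpu);
}
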
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
deleted file mode 100644
index 23b76e4690ef..000000000000
--- a/kernel/trace/trace_bts.c
+++ /dev/null
@@ -1,276 +0,0 @@
1/*
2 * BTS tracer
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28/*
29 * Information to interpret a BTS record.
30 * This will go into an in-kernel BTS interface.
31 */
32static unsigned char sizeof_field;
33static unsigned long debugctl_mask;
34
35#define sizeof_bts (3 * sizeof_field)
36
37static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
38{
39 switch (c->x86) {
40 case 0x6:
41 switch (c->x86_model) {
42 case 0x0 ... 0xC:
43 break;
44 case 0xD:
45 case 0xE: /* Pentium M */
46 sizeof_field = sizeof(long);
47 debugctl_mask = (1<<6)|(1<<7);
48 break;
49 default:
50 sizeof_field = 8;
51 debugctl_mask = (1<<6)|(1<<7);
52 break;
53 }
54 break;
55 case 0xF:
56 switch (c->x86_model) {
57 case 0x0:
58 case 0x1:
59 case 0x2: /* Netburst */
60 sizeof_field = sizeof(long);
61 debugctl_mask = (1<<2)|(1<<3);
62 break;
63 default:
64 /* sorry, don't know about them */
65 break;
66 }
67 break;
68 default:
69 /* sorry, don't know about them */
70 break;
71 }
72}
73
74static inline void bts_enable(void)
75{
76 unsigned long debugctl;
77
78 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
79 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
80}
81
82static inline void bts_disable(void)
83{
84 unsigned long debugctl;
85
86 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
87 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
88}
89
90static void bts_trace_reset(struct trace_array *tr)
91{
92 int cpu;
93
94 tr->time_start = ftrace_now(tr->cpu);
95
96 for_each_online_cpu(cpu)
97 tracing_reset(tr, cpu);
98}
99
100static void bts_trace_start_cpu(void *arg)
101{
102 this_tracer =
103 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
104 /* ovfl = */ NULL, /* th = */ (size_t)-1);
105 if (IS_ERR(this_tracer)) {
106 this_tracer = NULL;
107 return;
108 }
109
110 bts_enable();
111}
112
113static void bts_trace_start(struct trace_array *tr)
114{
115 int cpu;
116
117 bts_trace_reset(tr);
118
119 for_each_cpu_mask(cpu, cpu_possible_map)
120 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
121}
122
123static void bts_trace_stop_cpu(void *arg)
124{
125 if (this_tracer) {
126 bts_disable();
127
128 ds_release_bts(this_tracer);
129 this_tracer = NULL;
130 }
131}
132
133static void bts_trace_stop(struct trace_array *tr)
134{
135 int cpu;
136
137 for_each_cpu_mask(cpu, cpu_possible_map)
138 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
139}
140
141static int bts_trace_init(struct trace_array *tr)
142{
143 bts_trace_cpuinit(&boot_cpu_data);
144 bts_trace_reset(tr);
145 bts_trace_start(tr);
146
147 return 0;
148}
149
150static void bts_trace_print_header(struct seq_file *m)
151{
152#ifdef __i386__
153 seq_puts(m, "# CPU# FROM TO FUNCTION\n");
154 seq_puts(m, "# | | | |\n");
155#else
156 seq_puts(m,
157 "# CPU# FROM TO FUNCTION\n");
158 seq_puts(m,
159 "# | | | |\n");
160#endif
161}
162
163static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
164{
165 struct trace_entry *entry = iter->ent;
166 struct trace_seq *seq = &iter->seq;
167 struct bts_entry *it;
168
169 trace_assign_type(it, entry);
170
171 if (entry->type == TRACE_BTS) {
172 int ret;
173#ifdef CONFIG_KALLSYMS
174 char function[KSYM_SYMBOL_LEN];
175 sprint_symbol(function, it->from);
176#else
177 char *function = "<unknown>";
178#endif
179
180 ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n",
181 entry->cpu, it->from, it->to, function);
182 if (!ret)
183 return TRACE_TYPE_PARTIAL_LINE;;
184 return TRACE_TYPE_HANDLED;
185 }
186 return TRACE_TYPE_UNHANDLED;
187}
188
189void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
190{
191 struct ring_buffer_event *event;
192 struct bts_entry *entry;
193 unsigned long irq;
194
195 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
196 if (!event)
197 return;
198 entry = ring_buffer_event_data(event);
199 tracing_generic_entry_update(&entry->ent, 0, from);
200 entry->ent.type = TRACE_BTS;
201 entry->ent.cpu = smp_processor_id();
202 entry->from = from;
203 entry->to = to;
204 ring_buffer_unlock_commit(tr->buffer, event, irq);
205}
206
207static void trace_bts_at(struct trace_array *tr, size_t index)
208{
209 const void *raw = NULL;
210 unsigned long from, to;
211 int err;
212
213 err = ds_access_bts(this_tracer, index, &raw);
214 if (err < 0)
215 return;
216
217 from = *(const unsigned long *)raw;
218 to = *(const unsigned long *)((const char *)raw + sizeof_field);
219
220 trace_bts(tr, from, to);
221}
222
223static void trace_bts_cpu(void *arg)
224{
225 struct trace_array *tr = (struct trace_array *) arg;
226 size_t index = 0, end = 0, i;
227 int err;
228
229 if (!this_tracer)
230 return;
231
232 bts_disable();
233
234 err = ds_get_bts_index(this_tracer, &index);
235 if (err < 0)
236 goto out;
237
238 err = ds_get_bts_end(this_tracer, &end);
239 if (err < 0)
240 goto out;
241
242 for (i = index; i < end; i++)
243 trace_bts_at(tr, i);
244
245 for (i = 0; i < index; i++)
246 trace_bts_at(tr, i);
247
248out:
249 bts_enable();
250}
251
252static void trace_bts_prepare(struct trace_iterator *iter)
253{
254 int cpu;
255
256 for_each_cpu_mask(cpu, cpu_possible_map)
257 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
258}
259
260struct tracer bts_tracer __read_mostly =
261{
262 .name = "bts",
263 .init = bts_trace_init,
264 .reset = bts_trace_stop,
265 .print_header = bts_trace_print_header,
266 .print_line = bts_trace_print_line,
267 .start = bts_trace_start,
268 .stop = bts_trace_stop,
269 .open = trace_bts_prepare
270};
271
272__init static int init_bts_trace(void)
273{
274 return register_tracer(&bts_tracer);
275}
276device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e74f6d0a3216..9236d7e25a16 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,20 +16,10 @@
16 16
17#include "trace.h" 17#include "trace.h"
18 18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr, cpu);
27}
28
29static void start_function_trace(struct trace_array *tr) 19static void start_function_trace(struct trace_array *tr)
30{ 20{
31 tr->cpu = get_cpu(); 21 tr->cpu = get_cpu();
32 function_reset(tr); 22 tracing_reset_online_cpus(tr);
33 put_cpu(); 23 put_cpu();
34 24
35 tracing_start_cmdline_record(); 25 tracing_start_cmdline_record();
@@ -55,7 +45,7 @@ static void function_trace_reset(struct trace_array *tr)
55 45
56static void function_trace_start(struct trace_array *tr) 46static void function_trace_start(struct trace_array *tr)
57{ 47{
58 function_reset(tr); 48 tracing_reset_online_cpus(tr);
59} 49}
60 50
61static struct tracer function_trace __read_mostly = 51static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index af60eef4cbcc..930c08e5b38e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu)
79 int i; 79 int i;
80 int ret; 80 int ret;
81 int log10_this = log10_cpu(cpu); 81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); 82 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
83 83
84 84
85 /* 85 /*
@@ -231,6 +231,49 @@ trace_branch_is_leaf(struct trace_iterator *iter,
231 return true; 231 return true;
232} 232}
233 233
234static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid)
237{
238 int ret;
239
240 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED;
243
244 if (type == TRACE_GRAPH_ENT) {
245 ret = trace_seq_printf(s, "==========> | ");
246 } else {
247 /* Cpu */
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
249 ret = print_graph_cpu(s, cpu);
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258
259 ret = trace_seq_printf(s, " | ");
260 if (!ret)
261 return TRACE_TYPE_PARTIAL_LINE;
262 }
263
264 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
266 ret = trace_seq_printf(s, " ");
267 if (!ret)
268 return TRACE_TYPE_PARTIAL_LINE;
269 }
270
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED;
276}
234 277
235static enum print_line_t 278static enum print_line_t
236print_graph_duration(unsigned long long duration, struct trace_seq *s) 279print_graph_duration(unsigned long long duration, struct trace_seq *s)
@@ -344,7 +387,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
344 387
345static enum print_line_t 388static enum print_line_t
346print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, 389print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
347 struct trace_seq *s) 390 struct trace_seq *s, pid_t pid, int cpu)
348{ 391{
349 int i; 392 int i;
350 int ret; 393 int ret;
@@ -357,8 +400,18 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
357 return TRACE_TYPE_PARTIAL_LINE; 400 return TRACE_TYPE_PARTIAL_LINE;
358 } 401 }
359 402
360 /* No time */ 403 /* Interrupt */
361 ret = trace_seq_printf(s, " | "); 404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
405 if (ret == TRACE_TYPE_UNHANDLED) {
406 /* No time */
407 ret = trace_seq_printf(s, " | ");
408 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 }
414
362 415
363 /* Function */ 416 /* Function */
364 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -410,7 +463,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
410 if (trace_branch_is_leaf(iter, field)) 463 if (trace_branch_is_leaf(iter, field))
411 return print_graph_entry_leaf(iter, field, s); 464 return print_graph_entry_leaf(iter, field, s);
412 else 465 else
413 return print_graph_entry_nested(field, s); 466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
414 467
415} 468}
416 469
@@ -474,6 +527,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
474 if (!ret) 527 if (!ret)
475 return TRACE_TYPE_PARTIAL_LINE; 528 return TRACE_TYPE_PARTIAL_LINE;
476 } 529 }
530
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE;
534
477 return TRACE_TYPE_HANDLED; 535 return TRACE_TYPE_HANDLED;
478} 536}
479 537
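print_graph_irq(), added above, decides whether a graph record belongs to an interrupt handler by comparing the function address against the __irqentry_text_start/__irqentry_text_end linker symbols, and brackets the output with ==========> / <========== markers. The address-range test on its own amounts to:

/* Sketch of the section-boundary test used by print_graph_irq() above.
 * The symbols are provided by the linker script. */
extern char __irqentry_text_start[], __irqentry_text_end[];

static inline int in_irqentry_text(unsigned long addr)
{
	return addr >= (unsigned long)__irqentry_text_start &&
	       addr <  (unsigned long)__irqentry_text_end;
}
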
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
new file mode 100644
index 000000000000..649df22d435f
--- /dev/null
+++ b/kernel/trace/trace_hw_branches.c
@@ -0,0 +1,195 @@
1/*
2 * h/w branch tracer for x86 based on bts
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28static void bts_trace_start_cpu(void *arg)
29{
30 if (this_tracer)
31 ds_release_bts(this_tracer);
32
33 this_tracer =
34 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
35 /* ovfl = */ NULL, /* th = */ (size_t)-1,
36 BTS_KERNEL);
37 if (IS_ERR(this_tracer)) {
38 this_tracer = NULL;
39 return;
40 }
41}
42
43static void bts_trace_start(struct trace_array *tr)
44{
45 int cpu;
46
47 tracing_reset_online_cpus(tr);
48
49 for_each_cpu(cpu, cpu_possible_mask)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51}
52
53static void bts_trace_stop_cpu(void *arg)
54{
55 if (this_tracer) {
56 ds_release_bts(this_tracer);
57 this_tracer = NULL;
58 }
59}
60
61static void bts_trace_stop(struct trace_array *tr)
62{
63 int cpu;
64
65 for_each_cpu(cpu, cpu_possible_mask)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67}
68
69static int bts_trace_init(struct trace_array *tr)
70{
71 tracing_reset_online_cpus(tr);
72 bts_trace_start(tr);
73
74 return 0;
75}
76
77static void bts_trace_print_header(struct seq_file *m)
78{
79 seq_puts(m,
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83}
84
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
86{
87 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it;
90
91 trace_assign_type(it, entry);
92
93 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
96 it->from, it->to) &&
97 (!it->from ||
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;;
102 }
103 return TRACE_TYPE_UNHANDLED;
104}
105
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
107{
108 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry;
110 unsigned long irq;
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
113 if (!event)
114 return;
115 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from;
120 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq);
122}
123
124static void trace_bts_at(struct trace_array *tr,
125 const struct bts_trace *trace, void *at)
126{
127 struct bts_struct bts;
128 int err = 0;
129
130 WARN_ON_ONCE(!trace->read);
131 if (!trace->read)
132 return;
133
134 err = trace->read(this_tracer, at, &bts);
135 if (err < 0)
136 return;
137
138 switch (bts.qualifier) {
139 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
141 break;
142 }
143}
144
145static void trace_bts_cpu(void *arg)
146{
147 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace;
149 unsigned char *at;
150
151 if (!this_tracer)
152 return;
153
154 ds_suspend_bts(this_tracer);
155 trace = ds_read_bts(this_tracer);
156 if (!trace)
157 goto out;
158
159 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size)
161 trace_bts_at(tr, trace, at);
162
163 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size)
165 trace_bts_at(tr, trace, at);
166
167out:
168 ds_resume_bts(this_tracer);
169}
170
171static void trace_bts_prepare(struct trace_iterator *iter)
172{
173 int cpu;
174
175 for_each_cpu(cpu, cpu_possible_mask)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177}
178
179struct tracer bts_tracer __read_mostly =
180{
181 .name = "hw-branch-tracer",
182 .init = bts_trace_init,
183 .reset = bts_trace_stop,
184 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line,
186 .start = bts_trace_start,
187 .stop = bts_trace_stop,
188 .open = trace_bts_prepare
189};
190
191__init static int init_bts_trace(void)
192{
193 return register_tracer(&bts_tracer);
194}
195device_initcall(init_bts_trace);
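The new hw-branch tracer never touches another CPU's BTS state directly: bts_trace_start() and bts_trace_stop() iterate cpu_possible_mask and run a small callback on each CPU through smp_call_function_single(). That per-CPU call pattern in isolation, with a made-up callback, looks like:

/* Illustrative use of the per-CPU call pattern seen above;
 * do_something_on_cpu() is a made-up callback. */
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void do_something_on_cpu(void *arg)
{
	/* runs on the target CPU, in IPI context */
	pr_info("hello from CPU %d\n", smp_processor_id());
}

static void run_everywhere(void)
{
	int cpu;

	for_each_cpu(cpu, cpu_possible_mask)
		smp_call_function_single(cpu, do_something_on_cpu, NULL, 1);
}
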
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 2fb6da6523b3..fffcb069f1dc 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -22,14 +22,10 @@ static unsigned long prev_overruns;
22 22
23static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
24{ 24{
25 int cpu;
26
27 overrun_detected = false; 25 overrun_detected = false;
28 prev_overruns = 0; 26 prev_overruns = 0;
29 tr->time_start = ftrace_now(tr->cpu);
30 27
31 for_each_online_cpu(cpu) 28 tracing_reset_online_cpus(tr);
32 tracing_reset(tr, cpu);
33} 29}
34 30
35static int mmio_trace_init(struct trace_array *tr) 31static int mmio_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f62..7bda248daf55 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr)
39 39
40 trace_power_enabled = 1; 40 trace_power_enabled = 1;
41 41
42 for_each_cpu_mask(cpu, cpu_possible_map) 42 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 43 tracing_reset(tr, cpu);
44 return 0; 44 return 0;
45} 45}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 863390557b44..df175cb4564f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -49,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49} 49}
50 50
51static void 51static void
52probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) 52probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
53{ 53{
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
@@ -72,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
72 local_irq_restore(flags); 72 local_irq_restore(flags);
73} 73}
74 74
75static void sched_switch_reset(struct trace_array *tr)
76{
77 int cpu;
78
79 tr->time_start = ftrace_now(tr->cpu);
80
81 for_each_online_cpu(cpu)
82 tracing_reset(tr, cpu);
83}
84
85static int tracing_sched_register(void) 75static int tracing_sched_register(void)
86{ 76{
87 int ret; 77 int ret;
@@ -197,7 +187,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
197 187
198static void start_sched_trace(struct trace_array *tr) 188static void start_sched_trace(struct trace_array *tr)
199{ 189{
200 sched_switch_reset(tr); 190 tracing_reset_online_cpus(tr);
201 tracing_start_sched_switch_record(); 191 tracing_start_sched_switch_record();
202} 192}
203 193
@@ -221,7 +211,7 @@ static void sched_switch_trace_reset(struct trace_array *tr)
221 211
222static void sched_switch_trace_start(struct trace_array *tr) 212static void sched_switch_trace_start(struct trace_array *tr)
223{ 213{
224 sched_switch_reset(tr); 214 tracing_reset_online_cpus(tr);
225 tracing_start_sched_switch(); 215 tracing_start_sched_switch();
226} 216}
227 217
@@ -247,3 +237,4 @@ __init static int init_sched_switch_trace(void)
247 return register_tracer(&sched_switch_trace); 237 return register_tracer(&sched_switch_trace);
248} 238}
249device_initcall(init_sched_switch_trace); 239device_initcall(init_sched_switch_trace);
240
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0067b49746c1..43586b689e31 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -211,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr)
211} 211}
212 212
213static void 213static void
214probe_wakeup(struct rq *rq, struct task_struct *p) 214probe_wakeup(struct rq *rq, struct task_struct *p, int success)
215{ 215{
216 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
217 unsigned long flags; 217 unsigned long flags;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0b863f2cbc8e..d0871bc0aca5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/sysctl.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/fs.h> 15#include <linux/fs.h>
15#include "trace.h" 16#include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
31 32
32static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
33static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
35static DEFINE_MUTEX(stack_sysctl_mutex);
36
37int stack_tracer_enabled;
38static int last_stack_tracer_enabled;
34 39
35static inline void check_stack(void) 40static inline void check_stack(void)
36{ 41{
@@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
174 return count; 179 return count;
175} 180}
176 181
177static struct file_operations stack_max_size_fops = { 182static const struct file_operations stack_max_size_fops = {
178 .open = tracing_open_generic, 183 .open = tracing_open_generic,
179 .read = stack_max_size_read, 184 .read = stack_max_size_read,
180 .write = stack_max_size_write, 185 .write = stack_max_size_write,
@@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
272 return 0; 277 return 0;
273} 278}
274 279
275static struct seq_operations stack_trace_seq_ops = { 280static const struct seq_operations stack_trace_seq_ops = {
276 .start = t_start, 281 .start = t_start,
277 .next = t_next, 282 .next = t_next,
278 .stop = t_stop, 283 .stop = t_stop,
@@ -288,12 +293,47 @@ static int stack_trace_open(struct inode *inode, struct file *file)
288 return ret; 293 return ret;
289} 294}
290 295
291static struct file_operations stack_trace_fops = { 296static const struct file_operations stack_trace_fops = {
292 .open = stack_trace_open, 297 .open = stack_trace_open,
293 .read = seq_read, 298 .read = seq_read,
294 .llseek = seq_lseek, 299 .llseek = seq_lseek,
295}; 300};
296 301
302int
303stack_trace_sysctl(struct ctl_table *table, int write,
304 struct file *file, void __user *buffer, size_t *lenp,
305 loff_t *ppos)
306{
307 int ret;
308
309 mutex_lock(&stack_sysctl_mutex);
310
311 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
312
313 if (ret || !write ||
314 (last_stack_tracer_enabled == stack_tracer_enabled))
315 goto out;
316
317 last_stack_tracer_enabled = stack_tracer_enabled;
318
319 if (stack_tracer_enabled)
320 register_ftrace_function(&trace_ops);
321 else
322 unregister_ftrace_function(&trace_ops);
323
324 out:
325 mutex_unlock(&stack_sysctl_mutex);
326 return ret;
327}
328
329static __init int enable_stacktrace(char *str)
330{
331 stack_tracer_enabled = 1;
332 last_stack_tracer_enabled = 1;
333 return 1;
334}
335__setup("stacktrace", enable_stacktrace);
336
297static __init int stack_trace_init(void) 337static __init int stack_trace_init(void)
298{ 338{
299 struct dentry *d_tracer; 339 struct dentry *d_tracer;
@@ -311,7 +351,8 @@ static __init int stack_trace_init(void)
311 if (!entry) 351 if (!entry)
312 pr_warning("Could not create debugfs 'stack_trace' entry\n"); 352 pr_warning("Could not create debugfs 'stack_trace' entry\n");
313 353
314 register_ftrace_function(&trace_ops); 354 if (stack_tracer_enabled)
355 register_ftrace_function(&trace_ops);
315 356
316 return 0; 357 return 0;
317} 358}
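The stack tracer now hooks into ftrace only when explicitly enabled: at boot through the "stacktrace" kernel parameter handled by the __setup() above, or at run time through stack_trace_sysctl(), which registers or unregisters trace_ops under stack_sysctl_mutex. A boot-parameter toggle of the same shape, with a hypothetical flag name:

/* Illustrative __setup() boot-parameter toggle; the flag name is made up. */
#include <linux/init.h>

static int example_feature_enabled;

static __init int enable_example_feature(char *str)
{
	example_feature_enabled = 1;
	return 1;	/* non-zero: the option was consumed */
}
__setup("example_feature", enable_example_feature);
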
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 54960edb96d0..eaca5ad803ff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,27 +196,19 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
196 return HRTIMER_RESTART; 196 return HRTIMER_RESTART;
197} 197}
198 198
199static void start_stack_timer(int cpu) 199static void start_stack_timer(void *unused)
200{ 200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); 201 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}
209 208
210static void start_stack_timers(void) 209static void start_stack_timers(void)
211{ 210{
212 cpumask_t saved_mask = current->cpus_allowed; 211 on_each_cpu(start_stack_timer, NULL, 1);
213 int cpu;
214
215 for_each_online_cpu(cpu) {
216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217 start_stack_timer(cpu);
218 }
219 set_cpus_allowed_ptr(current, &saved_mask);
220} 212}
221 213
222static void stop_stack_timer(int cpu) 214static void stop_stack_timer(int cpu)
@@ -234,20 +226,10 @@ static void stop_stack_timers(void)
234 stop_stack_timer(cpu); 226 stop_stack_timer(cpu);
235} 227}
236 228
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr, cpu);
245}
246
247static void start_stack_trace(struct trace_array *tr) 229static void start_stack_trace(struct trace_array *tr)
248{ 230{
249 mutex_lock(&sample_timer_lock); 231 mutex_lock(&sample_timer_lock);
250 stack_reset(tr); 232 tracing_reset_online_cpus(tr);
251 start_stack_timers(); 233 start_stack_timers();
252 tracer_enabled = 1; 234 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock); 235 mutex_unlock(&sample_timer_lock);
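start_stack_timers() no longer migrates the current task from CPU to CPU by rewriting cpus_allowed; on_each_cpu() runs the callback on every online CPU, and the callback picks up its own CPU's hrtimer with __get_cpu_var(). The same conversion shape, shown on a hypothetical per-CPU counter:

/* Sketch of the on_each_cpu() pattern used above; names are illustrative. */
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(int, example_counter);

static void bump_local_counter(void *unused)
{
	/* runs on each online CPU in turn; touch only this CPU's copy */
	__get_cpu_var(example_counter)++;
}

static void bump_all_counters(void)
{
	/* third argument: wait for all CPUs to finish before returning */
	on_each_cpu(bump_local_counter, NULL, 1);
}
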
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532dfb..2dc06ab35716 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
27 */ 27 */
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 const struct cred *tcred;
30 struct timespec uptime, ts; 31 struct timespec uptime, ts;
31 u64 ac_etime; 32 u64 ac_etime;
32 33
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
53 stats->ac_flag |= AXSIG; 54 stats->ac_flag |= AXSIG;
54 stats->ac_nice = task_nice(tsk); 55 stats->ac_nice = task_nice(tsk);
55 stats->ac_sched = tsk->policy; 56 stats->ac_sched = tsk->policy;
56 stats->ac_uid = tsk->uid;
57 stats->ac_gid = tsk->gid;
58 stats->ac_pid = tsk->pid; 57 stats->ac_pid = tsk->pid;
59 rcu_read_lock(); 58 rcu_read_lock();
59 tcred = __task_cred(tsk);
60 stats->ac_uid = tcred->uid;
61 stats->ac_gid = tcred->gid;
60 stats->ac_ppid = pid_alive(tsk) ? 62 stats->ac_ppid = pid_alive(tsk) ?
61 rcu_dereference(tsk->real_parent)->tgid : 0; 63 rcu_dereference(tsk->real_parent)->tgid : 0;
62 rcu_read_unlock(); 64 rcu_read_unlock();
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2f..2460c3199b5a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
86{ 86{
87 const struct cred *cred = current_cred();
87 int retval; 88 int retval;
88 89
89 if (!(retval = put_user(high2lowuid(current->uid), ruid)) && 90 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
90 !(retval = put_user(high2lowuid(current->euid), euid))) 91 !(retval = put_user(high2lowuid(cred->euid), euid)))
91 retval = put_user(high2lowuid(current->suid), suid); 92 retval = put_user(high2lowuid(cred->suid), suid);
92 93
93 return retval; 94 return retval;
94} 95}
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
104 105
105asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
106{ 107{
108 const struct cred *cred = current_cred();
107 int retval; 109 int retval;
108 110
109 if (!(retval = put_user(high2lowgid(current->gid), rgid)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
110 !(retval = put_user(high2lowgid(current->egid), egid))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
111 retval = put_user(high2lowgid(current->sgid), sgid); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
112 114
113 return retval; 115 return retval;
114} 116}
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
161 163
162asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
163{ 165{
164 int i = 0; 166 const struct cred *cred = current_cred();
167 int i;
165 168
166 if (gidsetsize < 0) 169 if (gidsetsize < 0)
167 return -EINVAL; 170 return -EINVAL;
168 171
169 get_group_info(current->group_info); 172 i = cred->group_info->ngroups;
170 i = current->group_info->ngroups;
171 if (gidsetsize) { 173 if (gidsetsize) {
172 if (i > gidsetsize) { 174 if (i > gidsetsize) {
173 i = -EINVAL; 175 i = -EINVAL;
174 goto out; 176 goto out;
175 } 177 }
176 if (groups16_to_user(grouplist, current->group_info)) { 178 if (groups16_to_user(grouplist, cred->group_info)) {
177 i = -EFAULT; 179 i = -EFAULT;
178 goto out; 180 goto out;
179 } 181 }
180 } 182 }
181out: 183out:
182 put_group_info(current->group_info);
183 return i; 184 return i;
184} 185}
185 186
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
210 211
211asmlinkage long sys_getuid16(void) 212asmlinkage long sys_getuid16(void)
212{ 213{
213 return high2lowuid(current->uid); 214 return high2lowuid(current_uid());
214} 215}
215 216
216asmlinkage long sys_geteuid16(void) 217asmlinkage long sys_geteuid16(void)
217{ 218{
218 return high2lowuid(current->euid); 219 return high2lowuid(current_euid());
219} 220}
220 221
221asmlinkage long sys_getgid16(void) 222asmlinkage long sys_getgid16(void)
222{ 223{
223 return high2lowgid(current->gid); 224 return high2lowgid(current_gid());
224} 225}
225 226
226asmlinkage long sys_getegid16(void) 227asmlinkage long sys_getegid16(void)
227{ 228{
228 return high2lowgid(current->egid); 229 return high2lowgid(current_egid());
229} 230}
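The tsacct.c and uid16.c hunks above are part of the credentials rework: uid/gid and group_info are no longer read straight off task_struct but through the cred accessors, current_cred() and current_uid() for the running task, and __task_cred() under rcu_read_lock() for another task. The general access pattern, shown on a hypothetical helper:

/* Illustrative read of another task's credentials via the cred API. */
#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static uid_t example_task_uid(struct task_struct *tsk)
{
	const struct cred *tcred;
	uid_t uid;

	rcu_read_lock();
	tcred = __task_cred(tsk);	/* valid only inside the RCU read side */
	uid = tcred->uid;
	rcu_read_unlock();

	return uid;
}
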
diff --git a/kernel/user.c b/kernel/user.c
index cec2224bc9f5..477b6660f447 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,12 +16,13 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
19 20
20struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
21 .kref = { 22 .kref = {
22 .refcount = ATOMIC_INIT(2), 23 .refcount = ATOMIC_INIT(1),
23 }, 24 },
24 .root_user = &root_user, 25 .creator = &root_user,
25}; 26};
26EXPORT_SYMBOL_GPL(init_user_ns); 27EXPORT_SYMBOL_GPL(init_user_ns);
27 28
@@ -47,12 +48,14 @@ static struct kmem_cache *uid_cachep;
47 */ 48 */
48static DEFINE_SPINLOCK(uidhash_lock); 49static DEFINE_SPINLOCK(uidhash_lock);
49 50
51/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
50struct user_struct root_user = { 52struct user_struct root_user = {
51 .__count = ATOMIC_INIT(1), 53 .__count = ATOMIC_INIT(2),
52 .processes = ATOMIC_INIT(1), 54 .processes = ATOMIC_INIT(1),
53 .files = ATOMIC_INIT(0), 55 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns,
56#ifdef CONFIG_USER_SCHED 59#ifdef CONFIG_USER_SCHED
57 .tg = &init_task_group, 60 .tg = &init_task_group,
58#endif 61#endif
@@ -106,16 +109,10 @@ static int sched_create_user(struct user_struct *up)
106 return rc; 109 return rc;
107} 110}
108 111
109static void sched_switch_user(struct task_struct *p)
110{
111 sched_move_task(p);
112}
113
114#else /* CONFIG_USER_SCHED */ 112#else /* CONFIG_USER_SCHED */
115 113
116static void sched_destroy_user(struct user_struct *up) { } 114static void sched_destroy_user(struct user_struct *up) { }
117static int sched_create_user(struct user_struct *up) { return 0; } 115static int sched_create_user(struct user_struct *up) { return 0; }
118static void sched_switch_user(struct task_struct *p) { }
119 116
120#endif /* CONFIG_USER_SCHED */ 117#endif /* CONFIG_USER_SCHED */
121 118
@@ -244,13 +241,21 @@ static struct kobj_type uids_ktype = {
244 .release = uids_release, 241 .release = uids_release,
245}; 242};
246 243
247/* create /sys/kernel/uids/<uid>/cpu_share file for this user */ 244/*
245 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
246 * We do not create this file for users in a user namespace (until
247 * sysfs tagging is implemented).
248 *
249 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
250 */
248static int uids_user_create(struct user_struct *up) 251static int uids_user_create(struct user_struct *up)
249{ 252{
250 struct kobject *kobj = &up->kobj; 253 struct kobject *kobj = &up->kobj;
251 int error; 254 int error;
252 255
253 memset(kobj, 0, sizeof(struct kobject)); 256 memset(kobj, 0, sizeof(struct kobject));
257 if (up->user_ns != &init_user_ns)
258 return 0;
254 kobj->kset = uids_kset; 259 kobj->kset = uids_kset;
255 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); 260 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
256 if (error) { 261 if (error) {
@@ -286,6 +291,8 @@ static void remove_user_sysfs_dir(struct work_struct *w)
286 unsigned long flags; 291 unsigned long flags;
287 int remove_user = 0; 292 int remove_user = 0;
288 293
294 if (up->user_ns != &init_user_ns)
295 return;
289 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
290 * atomic. 297 * atomic.
291 */ 298 */
@@ -321,12 +328,13 @@ done:
321 * IRQ state (as stored in flags) is restored and uidhash_lock released 328 * IRQ state (as stored in flags) is restored and uidhash_lock released
322 * upon function exit. 329 * upon function exit.
323 */ 330 */
324static inline void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
325{ 332{
326 /* restore back the count */ 333 /* restore back the count */
327 atomic_inc(&up->__count); 334 atomic_inc(&up->__count);
328 spin_unlock_irqrestore(&uidhash_lock, flags); 335 spin_unlock_irqrestore(&uidhash_lock, flags);
329 336
337 put_user_ns(up->user_ns);
330 INIT_WORK(&up->work, remove_user_sysfs_dir); 338 INIT_WORK(&up->work, remove_user_sysfs_dir);
331 schedule_work(&up->work); 339 schedule_work(&up->work);
332} 340}
@@ -342,13 +350,14 @@ static inline void uids_mutex_unlock(void) { }
342 * IRQ state (as stored in flags) is restored and uidhash_lock released 350 * IRQ state (as stored in flags) is restored and uidhash_lock released
343 * upon function exit. 351 * upon function exit.
344 */ 352 */
345static inline void free_user(struct user_struct *up, unsigned long flags) 353static void free_user(struct user_struct *up, unsigned long flags)
346{ 354{
347 uid_hash_remove(up); 355 uid_hash_remove(up);
348 spin_unlock_irqrestore(&uidhash_lock, flags); 356 spin_unlock_irqrestore(&uidhash_lock, flags);
349 sched_destroy_user(up); 357 sched_destroy_user(up);
350 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
351 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
352 kmem_cache_free(uid_cachep, up); 361 kmem_cache_free(uid_cachep, up);
353} 362}
354 363
@@ -364,7 +373,7 @@ struct user_struct *find_user(uid_t uid)
364{ 373{
365 struct user_struct *ret; 374 struct user_struct *ret;
366 unsigned long flags; 375 unsigned long flags;
367 struct user_namespace *ns = current->nsproxy->user_ns; 376 struct user_namespace *ns = current_user_ns();
368 377
369 spin_lock_irqsave(&uidhash_lock, flags); 378 spin_lock_irqsave(&uidhash_lock, flags);
370 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 379 ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -411,6 +420,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
411 if (sched_create_user(new) < 0) 420 if (sched_create_user(new) < 0)
412 goto out_free_user; 421 goto out_free_user;
413 422
423 new->user_ns = get_user_ns(ns);
424
414 if (uids_user_create(new)) 425 if (uids_user_create(new))
415 goto out_destoy_sched; 426 goto out_destoy_sched;
416 427
@@ -434,7 +445,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
434 up = new; 445 up = new;
435 } 446 }
436 spin_unlock_irq(&uidhash_lock); 447 spin_unlock_irq(&uidhash_lock);
437
438 } 448 }
439 449
440 uids_mutex_unlock(); 450 uids_mutex_unlock();
@@ -443,6 +453,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
443 453
444out_destoy_sched: 454out_destoy_sched:
445 sched_destroy_user(new); 455 sched_destroy_user(new);
456 put_user_ns(new->user_ns);
446out_free_user: 457out_free_user:
447 kmem_cache_free(uid_cachep, new); 458 kmem_cache_free(uid_cachep, new);
448out_unlock: 459out_unlock:
@@ -450,63 +461,6 @@ out_unlock:
450 return NULL; 461 return NULL;
451} 462}
452 463
453void switch_uid(struct user_struct *new_user)
454{
455 struct user_struct *old_user;
456
457 /* What if a process setreuid()'s and this brings the
458 * new uid over his NPROC rlimit? We can check this now
459 * cheaply with the new uid cache, so if it matters
460 * we should be checking for it. -DaveM
461 */
462 old_user = current->user;
463 atomic_inc(&new_user->processes);
464 atomic_dec(&old_user->processes);
465 switch_uid_keyring(new_user);
466 current->user = new_user;
467 sched_switch_user(current);
468
469 /*
470 * We need to synchronize with __sigqueue_alloc()
471 * doing a get_uid(p->user).. If that saw the old
472 * user value, we need to wait until it has exited
473 * its critical region before we can free the old
474 * structure.
475 */
476 smp_mb();
477 spin_unlock_wait(&current->sighand->siglock);
478
479 free_uid(old_user);
480 suid_keys(current);
481}
482
483#ifdef CONFIG_USER_NS
484void release_uids(struct user_namespace *ns)
485{
486 int i;
487 unsigned long flags;
488 struct hlist_head *head;
489 struct hlist_node *nd;
490
491 spin_lock_irqsave(&uidhash_lock, flags);
492 /*
493 * collapse the chains so that the user_struct-s will
494 * be still alive, but not in hashes. subsequent free_uid()
495 * will free them.
496 */
497 for (i = 0; i < UIDHASH_SZ; i++) {
498 head = ns->uidhash_table + i;
499 while (!hlist_empty(head)) {
500 nd = head->first;
501 hlist_del_init(nd);
502 }
503 }
504 spin_unlock_irqrestore(&uidhash_lock, flags);
505
506 free_uid(ns->root_user);
507}
508#endif
509
510static int __init uid_cache_init(void) 464static int __init uid_cache_init(void)
511{ 465{
512 int n; 466 int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 532858fa5b88..79084311ee57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,60 +9,55 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/cred.h>
12 13
13/* 14/*
14 * Clone a new ns copying an original user ns, setting refcount to 1 15 * Create a new user namespace, deriving the creator from the user in the
15 * @old_ns: namespace to clone 16 * passed credentials, and replacing that user with the new root user for the
16 * Return NULL on error (failure to kmalloc), new ns otherwise 17 * new namespace.
18 *
19 * This is called by copy_creds(), which will finish setting the target task's
20 * credentials.
17 */ 21 */
18static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) 22int create_user_ns(struct cred *new)
19{ 23{
20 struct user_namespace *ns; 24 struct user_namespace *ns;
21 struct user_struct *new_user; 25 struct user_struct *root_user;
22 int n; 26 int n;
23 27
24 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 28 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
25 if (!ns) 29 if (!ns)
26 return ERR_PTR(-ENOMEM); 30 return -ENOMEM;
27 31
28 kref_init(&ns->kref); 32 kref_init(&ns->kref);
29 33
30 for (n = 0; n < UIDHASH_SZ; ++n) 34 for (n = 0; n < UIDHASH_SZ; ++n)
31 INIT_HLIST_HEAD(ns->uidhash_table + n); 35 INIT_HLIST_HEAD(ns->uidhash_table + n);
32 36
33 /* Insert new root user. */ 37 /* Alloc new root user. */
34 ns->root_user = alloc_uid(ns, 0); 38 root_user = alloc_uid(ns, 0);
35 if (!ns->root_user) { 39 if (!root_user) {
36 kfree(ns); 40 kfree(ns);
37 return ERR_PTR(-ENOMEM); 41 return -ENOMEM;
38 } 42 }
39 43
40 /* Reset current->user with a new one */ 44 /* set the new root user in the credentials under preparation */
41 new_user = alloc_uid(ns, current->uid); 45 ns->creator = new->user;
42 if (!new_user) { 46 new->user = root_user;
43 free_uid(ns->root_user); 47 new->uid = new->euid = new->suid = new->fsuid = 0;
44 kfree(ns); 48 new->gid = new->egid = new->sgid = new->fsgid = 0;
45 return ERR_PTR(-ENOMEM); 49 put_group_info(new->group_info);
46 } 50 new->group_info = get_group_info(&init_groups);
47 51#ifdef CONFIG_KEYS
48 switch_uid(new_user); 52 key_put(new->request_key_auth);
49 return ns; 53 new->request_key_auth = NULL;
50} 54#endif
51 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
52struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
53{
54 struct user_namespace *new_ns;
55
56 BUG_ON(!old_ns);
57 get_user_ns(old_ns);
58
59 if (!(flags & CLONE_NEWUSER))
60 return old_ns;
61 56
62 new_ns = clone_user_ns(old_ns); 57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
63 59
64 put_user_ns(old_ns); 60 return 0;
65 return new_ns;
66} 61}
67 62
68void free_user_ns(struct kref *kref) 63void free_user_ns(struct kref *kref)
@@ -70,7 +65,7 @@ void free_user_ns(struct kref *kref)
70 struct user_namespace *ns; 65 struct user_namespace *ns;
71 66
72 ns = container_of(kref, struct user_namespace, kref); 67 ns = container_of(kref, struct user_namespace, kref);
73 release_uids(ns); 68 free_uid(ns->creator);
74 kfree(ns); 69 kfree(ns);
75} 70}
76EXPORT_SYMBOL(free_user_ns); 71EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d4dc69ddebd7..2f445833ae37 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 73static LIST_HEAD(workqueues);
74 74
75static int singlethread_cpu __read_mostly; 75static int singlethread_cpu __read_mostly;
76static cpumask_t cpu_singlethread_map __read_mostly; 76static const struct cpumask *cpu_singlethread_map __read_mostly;
77/* 77/*
78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,24 +81,24 @@ static cpumask_t cpu_singlethread_map __read_mostly;
81 * use cpu_possible_map, the cpumask below is more a documentation 81 * use cpu_possible_map, the cpumask below is more a documentation
82 * than optimization. 82 * than optimization.
83 */ 83 */
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_var_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
88{ 88{
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? cpu_singlethread_map : cpu_populated_map;
96} 96}
97 97
98static 98static
99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) 99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
100{ 100{
101 if (unlikely(is_single_threaded(wq))) 101 if (unlikely(is_wq_single_threaded(wq)))
102 cpu = singlethread_cpu; 102 cpu = singlethread_cpu;
103 return per_cpu_ptr(wq->cpu_wq, cpu); 103 return per_cpu_ptr(wq->cpu_wq, cpu);
104} 104}
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
410 */ 410 */
411void flush_workqueue(struct workqueue_struct *wq) 411void flush_workqueue(struct workqueue_struct *wq)
412{ 412{
413 const cpumask_t *cpu_map = wq_cpu_map(wq); 413 const struct cpumask *cpu_map = wq_cpu_map(wq);
414 int cpu; 414 int cpu;
415 415
416 might_sleep(); 416 might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
532{ 532{
533 struct cpu_workqueue_struct *cwq; 533 struct cpu_workqueue_struct *cwq;
534 struct workqueue_struct *wq; 534 struct workqueue_struct *wq;
535 const cpumask_t *cpu_map; 535 const struct cpumask *cpu_map;
536 int cpu; 536 int cpu;
537 537
538 might_sleep(); 538 might_sleep();
@@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
769{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
771 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
772 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
773 struct task_struct *p; 773 struct task_struct *p;
774 774
775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
903 */ 903 */
904void destroy_workqueue(struct workqueue_struct *wq) 904void destroy_workqueue(struct workqueue_struct *wq)
905{ 905{
906 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const struct cpumask *cpu_map = wq_cpu_map(wq);
907 int cpu; 907 int cpu;
908 908
909 cpu_maps_update_begin(); 909 cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
933 933
934 switch (action) { 934 switch (action) {
935 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
936 cpu_set(cpu, cpu_populated_map); 936 cpumask_set_cpu(cpu, cpu_populated_map);
937 } 937 }
938undo: 938undo:
939 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
@@ -964,7 +964,7 @@ undo:
964 switch (action) { 964 switch (action) {
965 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
966 case CPU_POST_DEAD: 966 case CPU_POST_DEAD:
967 cpu_clear(cpu, cpu_populated_map); 967 cpumask_clear_cpu(cpu, cpu_populated_map);
968 } 968 }
969 969
970 return ret; 970 return ret;
@@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
1017 1017
1018void __init init_workqueues(void) 1018void __init init_workqueues(void)
1019{ 1019{
1020 cpu_populated_map = cpu_online_map; 1020 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1021 singlethread_cpu = first_cpu(cpu_possible_map); 1021
1022 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); 1022 cpumask_copy(cpu_populated_map, cpu_online_mask);
1023 singlethread_cpu = cpumask_first(cpu_possible_mask);
1024 cpu_singlethread_map = cpumask_of(singlethread_cpu);
1023 hotcpu_notifier(workqueue_cpu_callback, 0); 1025 hotcpu_notifier(workqueue_cpu_callback, 0);
1024 keventd_wq = create_workqueue("events"); 1026 keventd_wq = create_workqueue("events");
1025 BUG_ON(!keventd_wq); 1027 BUG_ON(!keventd_wq);
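The workqueue hunks convert the fixed-size cpumask_t globals to cpumask_var_t / const struct cpumask * and switch to the accessor operations (cpumask_copy(), cpumask_set_cpu(), cpumask_clear_cpu(), cpumask_first(), cpumask_of()). With CONFIG_CPUMASK_OFFSTACK a cpumask_var_t is heap-allocated, so it must be allocated before first use, as init_workqueues() now does. A minimal usage sketch with illustrative names:

/* Sketch of cpumask_var_t usage; names are illustrative. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/init.h>

static cpumask_var_t tracked_cpus;

static int __init tracked_cpus_init(void)
{
	/* With CONFIG_CPUMASK_OFFSTACK this allocates the mask;
	 * otherwise it is effectively a no-op that returns true. */
	if (!alloc_cpumask_var(&tracked_cpus, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(tracked_cpus, cpu_online_mask);
	cpumask_set_cpu(0, tracked_cpus);

	return 0;
}

static void tracked_cpus_exit(void)
{
	free_cpumask_var(tracked_cpus);
}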