author	Ingo Molnar <mingo@elte.hu>	2009-06-17 06:52:15 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-06-17 06:56:49 -0400
commit	eadb8a091b27a840de7450f84ecff5ef13476424 (patch)
tree	58c3782d40def63baa8167f3d31e3048cb4c7660 /kernel
parent	73874005cd8800440be4299bd095387fff4b90ac (diff)
parent	65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
	arch/x86/Kconfig
	arch/x86/kernel/traps.c
	arch/x86/power/cpu.c
	arch/x86/power/cpu_32.c
	kernel/Makefile

Semantic conflict:
	arch/x86/kernel/hw_breakpoint.c

Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/async.c | 13
-rw-r--r--  kernel/audit_tree.c | 6
-rw-r--r--  kernel/cgroup.c | 3
-rw-r--r--  kernel/compat.c | 11
-rw-r--r--  kernel/cpuset.c | 260
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/exit.c | 17
-rw-r--r--  kernel/fork.c | 38
-rw-r--r--  kernel/futex.c | 1208
-rw-r--r--  kernel/groups.c | 288
-rw-r--r--  kernel/hrtimer.c | 58
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/handle.c | 69
-rw-r--r--  kernel/irq/internals.h | 5
-rw-r--r--  kernel/irq/manage.c | 17
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/numa_migrate.c | 38
-rw-r--r--  kernel/kallsyms.c | 134
-rw-r--r--  kernel/kexec.c | 16
-rw-r--r--  kernel/kfifo.c | 4
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/lockdep_internals.h | 4
-rw-r--r--  kernel/module.c | 66
-rw-r--r--  kernel/mutex.c | 31
-rw-r--r--  kernel/panic.c | 35
-rw-r--r--  kernel/params.c | 46
-rw-r--r--  kernel/perf_counter.c | 4339
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/hibernate.c (renamed from kernel/power/disk.c) | 59
-rw-r--r--  kernel/power/hibernate_nvs.c | 135
-rw-r--r--  kernel/power/main.c | 526
-rw-r--r--  kernel/power/power.h | 25
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 80
-rw-r--r--  kernel/power/suspend.c | 300
-rw-r--r--  kernel/power/suspend_test.c | 187
-rw-r--r--  kernel/power/swsusp.c | 198
-rw-r--r--  kernel/printk.c | 33
-rw-r--r--  kernel/profile.c | 14
-rw-r--r--  kernel/ptrace.c | 11
-rw-r--r--  kernel/rcupreempt.c | 8
-rw-r--r--  kernel/rcutree.c | 25
-rw-r--r--  kernel/rcutree_trace.c | 64
-rw-r--r--  kernel/rtmutex.c | 250
-rw-r--r--  kernel/rtmutex_common.h | 8
-rw-r--r--  kernel/sched.c | 401
-rw-r--r--  kernel/sched_clock.c | 3
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_fair.c | 13
-rw-r--r--  kernel/sched_idletask.c | 3
-rw-r--r--  kernel/sched_rt.c | 2
-rw-r--r--  kernel/signal.c | 78
-rw-r--r--  kernel/slow-work.c | 27
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 13
-rw-r--r--  kernel/sys.c | 290
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 71
-rw-r--r--  kernel/time/clockevents.c | 14
-rw-r--r--  kernel/time/clocksource.c | 23
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 17
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/timekeeping.c | 9
-rw-r--r--  kernel/timer.c | 141
-rw-r--r--  kernel/trace/Kconfig | 70
-rw-r--r--  kernel/trace/Makefile | 5
-rw-r--r--  kernel/trace/blktrace.c | 91
-rw-r--r--  kernel/trace/ftrace.c | 79
-rw-r--r--  kernel/trace/ring_buffer.c | 114
-rw-r--r--  kernel/trace/trace.c | 26
-rw-r--r--  kernel/trace/trace_events.c | 4
-rw-r--r--  kernel/trace/trace_events_filter.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 85
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_stack.c | 2
-rw-r--r--  kernel/trace/trace_sysprof.c | 3
-rw-r--r--  kernel/user.c | 67
-rw-r--r--  kernel/wait.c | 2
85 files changed, 8179 insertions, 2197 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 18ad1110b226..f88decb1b445 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
@@ -97,6 +98,7 @@ obj-$(CONFIG_X86_DS) += trace/
97obj-$(CONFIG_SMP) += sched_cpupri.o 98obj-$(CONFIG_SMP) += sched_cpupri.o
98obj-$(CONFIG_SLOW_WORK) += slow-work.o 99obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 100obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
100 102
101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 103ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 104# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/async.c b/kernel/async.c
index 968ef9457d4e..27235f5de198 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -92,19 +92,18 @@ extern int initcall_debug;
92static async_cookie_t __lowest_in_progress(struct list_head *running) 92static async_cookie_t __lowest_in_progress(struct list_head *running)
93{ 93{
94 struct async_entry *entry; 94 struct async_entry *entry;
95
95 if (!list_empty(running)) { 96 if (!list_empty(running)) {
96 entry = list_first_entry(running, 97 entry = list_first_entry(running,
97 struct async_entry, list); 98 struct async_entry, list);
98 return entry->cookie; 99 return entry->cookie;
99 } else if (!list_empty(&async_pending)) {
100 entry = list_first_entry(&async_pending,
101 struct async_entry, list);
102 return entry->cookie;
103 } else {
104 /* nothing in progress... next_cookie is "infinity" */
105 return next_cookie;
106 } 100 }
107 101
102 list_for_each_entry(entry, &async_pending, list)
103 if (entry->running == running)
104 return entry->cookie;
105
106 return next_cookie; /* "infinity" value */
108} 107}
109 108
110static async_cookie_t lowest_in_progress(struct list_head *running) 109static async_cookie_t lowest_in_progress(struct list_head *running)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a82..1f6396d76687 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
568 if (err) 568 if (err)
569 goto skip_it; 569 goto skip_it;
570 570
571 root_mnt = collect_mounts(path.mnt, path.dentry); 571 root_mnt = collect_mounts(&path);
572 path_put(&path); 572 path_put(&path);
573 if (!root_mnt) 573 if (!root_mnt)
574 goto skip_it; 574 goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
660 err = kern_path(tree->pathname, 0, &path); 660 err = kern_path(tree->pathname, 0, &path);
661 if (err) 661 if (err)
662 goto Err; 662 goto Err;
663 mnt = collect_mounts(path.mnt, path.dentry); 663 mnt = collect_mounts(&path);
664 path_put(&path); 664 path_put(&path);
665 if (!mnt) { 665 if (!mnt) {
666 err = -ENOMEM; 666 err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
720 err = kern_path(new, 0, &path); 720 err = kern_path(new, 0, &path);
721 if (err) 721 if (err)
722 return err; 722 return err;
723 tagged = collect_mounts(path.mnt, path.dentry); 723 tagged = collect_mounts(&path);
724 path_put(&path); 724 path_put(&path);
725 if (!tagged) 725 if (!tagged)
726 return -ENOMEM; 726 return -ENOMEM;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765..3fb789f6df94 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h>
49 50
50#include <asm/atomic.h> 51#include <asm/atomic.h>
51 52
@@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
900 struct cgroup *cgrp = &root->top_cgroup; 901 struct cgroup *cgrp = &root->top_cgroup;
901 struct cgroup_sb_opts opts; 902 struct cgroup_sb_opts opts;
902 903
904 lock_kernel();
903 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 905 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
904 mutex_lock(&cgroup_mutex); 906 mutex_lock(&cgroup_mutex);
905 907
@@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
927 kfree(opts.release_agent); 929 kfree(opts.release_agent);
928 mutex_unlock(&cgroup_mutex); 930 mutex_unlock(&cgroup_mutex);
929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 931 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
932 unlock_kernel();
930 return ret; 933 return ret;
931} 934}
932 935
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by other task, we use alloc_lock in the task_struct fields to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1147 * @cs: the cpuset in which each task's spread flags needs to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d707..1bb4d7e5d616 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
167 167
168/* 168/*
169 * Prepare credentials for current to perform an execve() 169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex 170 * - The caller must hold current->cred_guard_mutex
171 */ 171 */
172struct cred *prepare_exec_creds(void) 172struct cred *prepare_exec_creds(void)
173{ 173{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
276 struct cred *new; 276 struct cred *new;
277 int ret; 277 int ret;
278 278
279 mutex_init(&p->cred_exec_mutex); 279 mutex_init(&p->cred_guard_mutex);
280 280
281 if ( 281 if (
282#ifdef CONFIG_KEYS 282#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index cab535c427b8..b6c90b5ef509 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 49#include <linux/fs_struct.h>
50#include <linux/init_task.h> 50#include <linux/init_task.h>
51#include <linux/perf_counter.h>
51#include <trace/events/sched.h> 52#include <trace/events/sched.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
@@ -154,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 155{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 157
158#ifdef CONFIG_PERF_COUNTERS
159 WARN_ON_ONCE(tsk->perf_counter_ctxp);
160#endif
157 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
158 put_task_struct(tsk); 162 put_task_struct(tsk);
159} 163}
@@ -170,6 +174,7 @@ repeat:
170 atomic_dec(&__task_cred(p)->user->processes); 174 atomic_dec(&__task_cred(p)->user->processes);
171 175
172 proc_flush_task(p); 176 proc_flush_task(p);
177
173 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
174 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
175 __exit_signal(p); 180 __exit_signal(p);
@@ -971,16 +976,19 @@ NORET_TYPE void do_exit(long code)
971 module_put(tsk->binfmt->module); 976 module_put(tsk->binfmt->module);
972 977
973 proc_exit_connector(tsk); 978 proc_exit_connector(tsk);
979
980 /*
981 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications.
983 */
984 perf_counter_exit_task(tsk);
985
974 exit_notify(tsk, group_dead); 986 exit_notify(tsk, group_dead);
975#ifdef CONFIG_NUMA 987#ifdef CONFIG_NUMA
976 mpol_put(tsk->mempolicy); 988 mpol_put(tsk->mempolicy);
977 tsk->mempolicy = NULL; 989 tsk->mempolicy = NULL;
978#endif 990#endif
979#ifdef CONFIG_FUTEX 991#ifdef CONFIG_FUTEX
980 /*
981 * This must happen late, after the PID is not
982 * hashed anymore:
983 */
984 if (unlikely(!list_empty(&tsk->pi_state_list))) 992 if (unlikely(!list_empty(&tsk->pi_state_list)))
985 exit_pi_state_list(tsk); 993 exit_pi_state_list(tsk);
986 if (unlikely(current->pi_state_cache)) 994 if (unlikely(current->pi_state_cache))
@@ -1472,6 +1480,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1472 */ 1480 */
1473 if (*notask_error) 1481 if (*notask_error)
1474 *notask_error = ret; 1482 *notask_error = ret;
1483 return 0;
1475 } 1484 }
1476 1485
1477 if (likely(!ptrace) && unlikely(p->ptrace)) { 1486 if (likely(!ptrace) && unlikely(p->ptrace)) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 711468f3db2a..be022c200da6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -62,6 +62,7 @@
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h>
65 66
66#include <asm/pgtable.h> 67#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -177,7 +178,7 @@ void __init fork_init(unsigned long mempages)
177 /* create a slab on which task_structs can be allocated */ 178 /* create a slab on which task_structs can be allocated */
178 task_struct_cachep = 179 task_struct_cachep =
179 kmem_cache_create("task_struct", sizeof(struct task_struct), 180 kmem_cache_create("task_struct", sizeof(struct task_struct),
180 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 181 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
181#endif 182#endif
182 183
183 /* do the arch specific task caches init */ 184 /* do the arch specific task caches init */
@@ -981,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
981 if (!p) 982 if (!p)
982 goto fork_out; 983 goto fork_out;
983 984
985 ftrace_graph_init_task(p);
986
984 rt_mutex_init_task(p); 987 rt_mutex_init_task(p);
985 988
986#ifdef CONFIG_PROVE_LOCKING 989#ifdef CONFIG_PROVE_LOCKING
@@ -1094,6 +1097,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1094 /* Perform scheduler related setup. Assign this task to a CPU. */ 1097 /* Perform scheduler related setup. Assign this task to a CPU. */
1095 sched_fork(p, clone_flags); 1098 sched_fork(p, clone_flags);
1096 1099
1100 retval = perf_counter_init_task(p);
1101 if (retval)
1102 goto bad_fork_cleanup_policy;
1103
1097 if ((retval = audit_alloc(p))) 1104 if ((retval = audit_alloc(p)))
1098 goto bad_fork_cleanup_policy; 1105 goto bad_fork_cleanup_policy;
1099 /* copy all the process information */ 1106 /* copy all the process information */
@@ -1130,8 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1130 } 1137 }
1131 } 1138 }
1132 1139
1133 ftrace_graph_init_task(p);
1134
1135 p->pid = pid_nr(pid); 1140 p->pid = pid_nr(pid);
1136 p->tgid = p->pid; 1141 p->tgid = p->pid;
1137 if (clone_flags & CLONE_THREAD) 1142 if (clone_flags & CLONE_THREAD)
@@ -1140,7 +1145,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1140 if (current->nsproxy != p->nsproxy) { 1145 if (current->nsproxy != p->nsproxy) {
1141 retval = ns_cgroup_clone(p, pid); 1146 retval = ns_cgroup_clone(p, pid);
1142 if (retval) 1147 if (retval)
1143 goto bad_fork_free_graph; 1148 goto bad_fork_free_pid;
1144 } 1149 }
1145 1150
1146 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1151 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1232,7 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1232 spin_unlock(&current->sighand->siglock); 1237 spin_unlock(&current->sighand->siglock);
1233 write_unlock_irq(&tasklist_lock); 1238 write_unlock_irq(&tasklist_lock);
1234 retval = -ERESTARTNOINTR; 1239 retval = -ERESTARTNOINTR;
1235 goto bad_fork_free_graph; 1240 goto bad_fork_free_pid;
1236 } 1241 }
1237 1242
1238 if (clone_flags & CLONE_THREAD) { 1243 if (clone_flags & CLONE_THREAD) {
@@ -1267,8 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1267 cgroup_post_fork(p); 1272 cgroup_post_fork(p);
1268 return p; 1273 return p;
1269 1274
1270bad_fork_free_graph:
1271 ftrace_graph_exit_task(p);
1272bad_fork_free_pid: 1275bad_fork_free_pid:
1273 if (pid != &init_struct_pid) 1276 if (pid != &init_struct_pid)
1274 free_pid(pid); 1277 free_pid(pid);
@@ -1292,6 +1295,7 @@ bad_fork_cleanup_semundo:
1292bad_fork_cleanup_audit: 1295bad_fork_cleanup_audit:
1293 audit_free(p); 1296 audit_free(p);
1294bad_fork_cleanup_policy: 1297bad_fork_cleanup_policy:
1298 perf_counter_free_task(p);
1295#ifdef CONFIG_NUMA 1299#ifdef CONFIG_NUMA
1296 mpol_put(p->mempolicy); 1300 mpol_put(p->mempolicy);
1297bad_fork_cleanup_cgroup: 1301bad_fork_cleanup_cgroup:
@@ -1405,10 +1409,16 @@ long do_fork(unsigned long clone_flags,
1405 if (clone_flags & CLONE_VFORK) { 1409 if (clone_flags & CLONE_VFORK) {
1406 p->vfork_done = &vfork; 1410 p->vfork_done = &vfork;
1407 init_completion(&vfork); 1411 init_completion(&vfork);
1412 } else if (!(clone_flags & CLONE_VM)) {
1413 /*
1414 * vfork will do an exec which will call
1415 * set_task_comm()
1416 */
1417 perf_counter_fork(p);
1408 } 1418 }
1409 1419
1410 audit_finish_fork(p); 1420 audit_finish_fork(p);
1411 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1421 tracehook_report_clone(regs, clone_flags, nr, p);
1412 1422
1413 /* 1423 /*
1414 * We set PF_STARTING at creation in case tracing wants to 1424 * We set PF_STARTING at creation in case tracing wants to
@@ -1460,20 +1470,20 @@ void __init proc_caches_init(void)
1460{ 1470{
1461 sighand_cachep = kmem_cache_create("sighand_cache", 1471 sighand_cachep = kmem_cache_create("sighand_cache",
1462 sizeof(struct sighand_struct), 0, 1472 sizeof(struct sighand_struct), 0,
1463 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1464 sighand_ctor); 1474 SLAB_NOTRACK, sighand_ctor);
1465 signal_cachep = kmem_cache_create("signal_cache", 1475 signal_cachep = kmem_cache_create("signal_cache",
1466 sizeof(struct signal_struct), 0, 1476 sizeof(struct signal_struct), 0,
1467 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1468 files_cachep = kmem_cache_create("files_cache", 1478 files_cachep = kmem_cache_create("files_cache",
1469 sizeof(struct files_struct), 0, 1479 sizeof(struct files_struct), 0,
1470 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1471 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1472 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1474 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1475 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1476 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1477 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1478 mmap_init(); 1488 mmap_init();
1479} 1489}
diff --git a/kernel/futex.c b/kernel/futex.c
index eef8cd26b5e5..80b5ce716596 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -193,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key)
193 * @uaddr: virtual address of the futex 199 * @uaddr: virtual address of the futex
194 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 200 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
195 * @key: address where result is stored. 201 * @key: address where result is stored.
202 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
196 * 203 *
197 * Returns a negative error code or 0 204 * Returns a negative error code or 0
198 * The key words are stored in *key on success. 205 * The key words are stored in *key on success.
@@ -203,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key)
203 * 210 *
204 * lock_page() might sleep, the caller should not hold a spinlock. 211 * lock_page() might sleep, the caller should not hold a spinlock.
205 */ 212 */
206static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 213static int
214get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
207{ 215{
208 unsigned long address = (unsigned long)uaddr; 216 unsigned long address = (unsigned long)uaddr;
209 struct mm_struct *mm = current->mm; 217 struct mm_struct *mm = current->mm;
@@ -226,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
226 * but access_ok() should be faster than find_vma() 234 * but access_ok() should be faster than find_vma()
227 */ 235 */
228 if (!fshared) { 236 if (!fshared) {
229 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) 237 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
230 return -EFAULT; 238 return -EFAULT;
231 key->private.mm = mm; 239 key->private.mm = mm;
232 key->private.address = address; 240 key->private.address = address;
@@ -235,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
235 } 243 }
236 244
237again: 245again:
238 err = get_user_pages_fast(address, 1, 0, &page); 246 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
239 if (err < 0) 247 if (err < 0)
240 return err; 248 return err;
241 249
@@ -276,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
276 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
277} 285}
278 286
287/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in
290 * @key: the futex key (to distinguish it from other futex futex_q's)
291 *
292 * Must be called with the hb lock held.
293 */
294static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
295 union futex_key *key)
296{
297 struct futex_q *this;
298
299 plist_for_each_entry(this, &hb->chain, list) {
300 if (match_futex(&this->key, key))
301 return this;
302 }
303 return NULL;
304}
305
279static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 306static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
280{ 307{
281 u32 curval; 308 u32 curval;
@@ -537,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
537 return 0; 564 return 0;
538} 565}
539 566
567/**
568 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
569 * @uaddr: the pi futex user address
570 * @hb: the pi futex hash bucket
571 * @key: the futex key associated with uaddr and hb
572 * @ps: the pi_state pointer where we store the result of the
573 * lookup
574 * @task: the task to perform the atomic lock work for. This will
575 * be "current" except in the case of requeue pi.
576 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
577 *
578 * Returns:
579 * 0 - ready to wait
580 * 1 - acquired the lock
581 * <0 - error
582 *
583 * The hb->lock and futex_key refs shall be held by the caller.
584 */
585static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
586 union futex_key *key,
587 struct futex_pi_state **ps,
588 struct task_struct *task, int set_waiters)
589{
590 int lock_taken, ret, ownerdied = 0;
591 u32 uval, newval, curval;
592
593retry:
594 ret = lock_taken = 0;
595
596 /*
597 * To avoid races, we attempt to take the lock here again
598 * (by doing a 0 -> TID atomic cmpxchg), while holding all
599 * the locks. It will most likely not succeed.
600 */
601 newval = task_pid_vnr(task);
602 if (set_waiters)
603 newval |= FUTEX_WAITERS;
604
605 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
606
607 if (unlikely(curval == -EFAULT))
608 return -EFAULT;
609
610 /*
611 * Detect deadlocks.
612 */
613 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
614 return -EDEADLK;
615
616 /*
617 * Surprise - we got the lock. Just return to userspace:
618 */
619 if (unlikely(!curval))
620 return 1;
621
622 uval = curval;
623
624 /*
625 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
626 * to wake at the next unlock.
627 */
628 newval = curval | FUTEX_WAITERS;
629
630 /*
631 * There are two cases, where a futex might have no owner (the
632 * owner TID is 0): OWNER_DIED. We take over the futex in this
633 * case. We also do an unconditional take over, when the owner
634 * of the futex died.
635 *
636 * This is safe as we are protected by the hash bucket lock !
637 */
638 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
639 /* Keep the OWNER_DIED bit */
640 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
641 ownerdied = 0;
642 lock_taken = 1;
643 }
644
645 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
646
647 if (unlikely(curval == -EFAULT))
648 return -EFAULT;
649 if (unlikely(curval != uval))
650 goto retry;
651
652 /*
653 * We took the lock due to owner died take over.
654 */
655 if (unlikely(lock_taken))
656 return 1;
657
658 /*
659 * We dont have the lock. Look up the PI state (or create it if
660 * we are the first waiter):
661 */
662 ret = lookup_pi_state(uval, hb, key, ps);
663
664 if (unlikely(ret)) {
665 switch (ret) {
666 case -ESRCH:
667 /*
668 * No owner found for this futex. Check if the
669 * OWNER_DIED bit is set to figure out whether
670 * this is a robust futex or not.
671 */
672 if (get_futex_value_locked(&curval, uaddr))
673 return -EFAULT;
674
675 /*
676 * We simply start over in case of a robust
677 * futex. The code above will take the futex
678 * and return happy.
679 */
680 if (curval & FUTEX_OWNER_DIED) {
681 ownerdied = 1;
682 goto retry;
683 }
684 default:
685 break;
686 }
687 }
688
689 return ret;
690}
691
540/* 692/*
541 * The hash bucket lock must be held when this is called. 693 * The hash bucket lock must be held when this is called.
542 * Afterwards, the futex_q must not be accessed. 694 * Afterwards, the futex_q must not be accessed.
543 */ 695 */
544static void wake_futex(struct futex_q *q) 696static void wake_futex(struct futex_q *q)
545{ 697{
546 plist_del(&q->list, &q->list.plist); 698 struct task_struct *p = q->task;
699
547 /* 700 /*
548 * The lock in wake_up_all() is a crucial memory barrier after the 701 * We set q->lock_ptr = NULL _before_ we wake up the task. If
549 * plist_del() and also before assigning to q->lock_ptr. 702 * a non futex wake up happens on another CPU then the task
703 * might exit and p would dereference a non existing task
704 * struct. Prevent this by holding a reference on p across the
705 * wake up.
550 */ 706 */
551 wake_up(&q->waiter); 707 get_task_struct(p);
708
709 plist_del(&q->list, &q->list.plist);
552 /* 710 /*
553 * The waiting task can free the futex_q as soon as this is written, 711 * The waiting task can free the futex_q as soon as
554 * without taking any locks. This must come last. 712 * q->lock_ptr = NULL is written, without taking any locks. A
555 * 713 * memory barrier is required here to prevent the following
556 * A memory barrier is required here to prevent the following store to 714 * store to lock_ptr from getting ahead of the plist_del.
557 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
558 * end of wake_up() does not prevent this store from moving.
559 */ 715 */
560 smp_wmb(); 716 smp_wmb();
561 q->lock_ptr = NULL; 717 q->lock_ptr = NULL;
718
719 wake_up_state(p, TASK_NORMAL);
720 put_task_struct(p);
562} 721}
563 722
564static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 723static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -677,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
677 if (!bitset) 836 if (!bitset)
678 return -EINVAL; 837 return -EINVAL;
679 838
680 ret = get_futex_key(uaddr, fshared, &key); 839 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
681 if (unlikely(ret != 0)) 840 if (unlikely(ret != 0))
682 goto out; 841 goto out;
683 842
@@ -687,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
687 846
688 plist_for_each_entry_safe(this, next, head, list) { 847 plist_for_each_entry_safe(this, next, head, list) {
689 if (match_futex (&this->key, &key)) { 848 if (match_futex (&this->key, &key)) {
690 if (this->pi_state) { 849 if (this->pi_state || this->rt_waiter) {
691 ret = -EINVAL; 850 ret = -EINVAL;
692 break; 851 break;
693 } 852 }
@@ -723,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
723 int ret, op_ret; 882 int ret, op_ret;
724 883
725retry: 884retry:
726 ret = get_futex_key(uaddr1, fshared, &key1); 885 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
727 if (unlikely(ret != 0)) 886 if (unlikely(ret != 0))
728 goto out; 887 goto out;
729 ret = get_futex_key(uaddr2, fshared, &key2); 888 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
730 if (unlikely(ret != 0)) 889 if (unlikely(ret != 0))
731 goto out_put_key1; 890 goto out_put_key1;
732 891
@@ -800,24 +959,185 @@ out:
800 return ret; 959 return ret;
801} 960}
802 961
803/* 962/**
804 * Requeue all waiters hashed on one physical page to another 963 * requeue_futex() - Requeue a futex_q from one hb to another
805 * physical page. 964 * @q: the futex_q to requeue
965 * @hb1: the source hash_bucket
966 * @hb2: the target hash_bucket
967 * @key2: the new key for the requeued futex_q
968 */
969static inline
970void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
971 struct futex_hash_bucket *hb2, union futex_key *key2)
972{
973
974 /*
975 * If key1 and key2 hash to the same bucket, no need to
976 * requeue.
977 */
978 if (likely(&hb1->chain != &hb2->chain)) {
979 plist_del(&q->list, &hb1->chain);
980 plist_add(&q->list, &hb2->chain);
981 q->lock_ptr = &hb2->lock;
982#ifdef CONFIG_DEBUG_PI_LIST
983 q->list.plist.lock = &hb2->lock;
984#endif
985 }
986 get_futex_key_refs(key2);
987 q->key = *key2;
988}
989
990/**
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q
993 * key: the key of the requeue target futex
994 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held.
1000 */
1001static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1003{
1004 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key);
1006 q->key = *key;
1007
1008 WARN_ON(plist_node_empty(&q->list));
1009 plist_del(&q->list, &q->list.plist);
1010
1011 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL;
1013
1014 wake_up_state(q->task, TASK_NORMAL);
1015}
1016
1017/**
1018 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1019 * @pifutex: the user address of the to futex
1020 * @hb1: the from futex hash bucket, must be locked by the caller
1021 * @hb2: the to futex hash bucket, must be locked by the caller
1022 * @key1: the from futex key
1023 * @key2: the to futex key
1024 * @ps: address to store the pi_state pointer
1025 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1026 *
1027 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1028 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1029 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1030 * hb1 and hb2 must be held by the caller.
1031 *
1032 * Returns:
1033 * 0 - failed to acquire the lock atomicly
1034 * 1 - acquired the lock
1035 * <0 - error
1036 */
1037static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1038 struct futex_hash_bucket *hb1,
1039 struct futex_hash_bucket *hb2,
1040 union futex_key *key1, union futex_key *key2,
1041 struct futex_pi_state **ps, int set_waiters)
1042{
1043 struct futex_q *top_waiter = NULL;
1044 u32 curval;
1045 int ret;
1046
1047 if (get_futex_value_locked(&curval, pifutex))
1048 return -EFAULT;
1049
1050 /*
1051 * Find the top_waiter and determine if there are additional waiters.
1052 * If the caller intends to requeue more than 1 waiter to pifutex,
1053 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1054 * as we have means to handle the possible fault. If not, don't set
1055 * the bit unecessarily as it will force the subsequent unlock to enter
1056 * the kernel.
1057 */
1058 top_waiter = futex_top_waiter(hb1, key1);
1059
1060 /* There are no waiters, nothing for us to do. */
1061 if (!top_waiter)
1062 return 0;
1063
1064 /*
1065 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1066 * the contended case or if set_waiters is 1. The pi_state is returned
1067 * in ps in contended cases.
1068 */
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters);
1071 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2);
1073
1074 return ret;
1075}
1076
1077/**
1078 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1079 * uaddr1: source futex user address
1080 * uaddr2: target futex user address
1081 * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1082 * nr_requeue: number of waiters to requeue (0-INT_MAX)
1083 * requeue_pi: if we are attempting to requeue from a non-pi futex to a
1084 * pi futex (pi to pi requeue is not supported)
1085 *
1086 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1087 * uaddr2 atomically on behalf of the top waiter.
1088 *
1089 * Returns:
1090 * >=0 - on success, the number of tasks requeued or woken
1091 * <0 - on error
806 */ 1092 */
807static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1093static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
808 int nr_wake, int nr_requeue, u32 *cmpval) 1094 int nr_wake, int nr_requeue, u32 *cmpval,
1095 int requeue_pi)
809{ 1096{
810 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1097 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1098 int drop_count = 0, task_count = 0, ret;
1099 struct futex_pi_state *pi_state = NULL;
811 struct futex_hash_bucket *hb1, *hb2; 1100 struct futex_hash_bucket *hb1, *hb2;
812 struct plist_head *head1; 1101 struct plist_head *head1;
813 struct futex_q *this, *next; 1102 struct futex_q *this, *next;
814 int ret, drop_count = 0; 1103 u32 curval2;
1104
1105 if (requeue_pi) {
1106 /*
1107 * requeue_pi requires a pi_state, try to allocate it now
1108 * without any locks in case it fails.
1109 */
1110 if (refill_pi_state_cache())
1111 return -ENOMEM;
1112 /*
1113 * requeue_pi must wake as many tasks as it can, up to nr_wake
1114 * + nr_requeue, since it acquires the rt_mutex prior to
1115 * returning to userspace, so as to not leave the rt_mutex with
1116 * waiters and no owner. However, second and third wake-ups
1117 * cannot be predicted as they involve race conditions with the
1118 * first wake and a fault while looking up the pi_state. Both
1119 * pthread_cond_signal() and pthread_cond_broadcast() should
1120 * use nr_wake=1.
1121 */
1122 if (nr_wake != 1)
1123 return -EINVAL;
1124 }
815 1125
816retry: 1126retry:
817 ret = get_futex_key(uaddr1, fshared, &key1); 1127 if (pi_state != NULL) {
1128 /*
1129 * We will have to lookup the pi_state again, so free this one
1130 * to keep the accounting correct.
1131 */
1132 free_pi_state(pi_state);
1133 pi_state = NULL;
1134 }
1135
1136 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
818 if (unlikely(ret != 0)) 1137 if (unlikely(ret != 0))
819 goto out; 1138 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2); 1139 ret = get_futex_key(uaddr2, fshared, &key2,
1140 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
821 if (unlikely(ret != 0)) 1141 if (unlikely(ret != 0))
822 goto out_put_key1; 1142 goto out_put_key1;
823 1143
@@ -852,32 +1172,99 @@ retry_private:
852 } 1172 }
853 } 1173 }
854 1174
1175 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1176 /*
1177 * Attempt to acquire uaddr2 and wake the top waiter. If we
1178 * intend to requeue waiters, force setting the FUTEX_WAITERS
1179 * bit. We force this here where we are able to easily handle
1180 * faults rather in the requeue loop below.
1181 */
1182 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1183 &key2, &pi_state, nr_requeue);
1184
1185 /*
1186 * At this point the top_waiter has either taken uaddr2 or is
1187 * waiting on it. If the former, then the pi_state will not
1188 * exist yet, look it up one more time to ensure we have a
1189 * reference to it.
1190 */
1191 if (ret == 1) {
1192 WARN_ON(pi_state);
1193 task_count++;
1194 ret = get_futex_value_locked(&curval2, uaddr2);
1195 if (!ret)
1196 ret = lookup_pi_state(curval2, hb2, &key2,
1197 &pi_state);
1198 }
1199
1200 switch (ret) {
1201 case 0:
1202 break;
1203 case -EFAULT:
1204 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2);
1208 if (!ret)
1209 goto retry;
1210 goto out;
1211 case -EAGAIN:
1212 /* The owner was exiting, try again. */
1213 double_unlock_hb(hb1, hb2);
1214 put_futex_key(fshared, &key2);
1215 put_futex_key(fshared, &key1);
1216 cond_resched();
1217 goto retry;
1218 default:
1219 goto out_unlock;
1220 }
1221 }
1222
855 head1 = &hb1->chain; 1223 head1 = &hb1->chain;
856 plist_for_each_entry_safe(this, next, head1, list) { 1224 plist_for_each_entry_safe(this, next, head1, list) {
857 if (!match_futex (&this->key, &key1)) 1225 if (task_count - nr_wake >= nr_requeue)
1226 break;
1227
1228 if (!match_futex(&this->key, &key1))
858 continue; 1229 continue;
859 if (++ret <= nr_wake) { 1230
1231 WARN_ON(!requeue_pi && this->rt_waiter);
1232 WARN_ON(requeue_pi && !this->rt_waiter);
1233
1234 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1236 * lock, we already woke the top_waiter. If not, it will be
1237 * woken by futex_unlock_pi().
1238 */
1239 if (++task_count <= nr_wake && !requeue_pi) {
860 wake_futex(this); 1240 wake_futex(this);
861 } else { 1241 continue;
862 /* 1242 }
863 * If key1 and key2 hash to the same bucket, no need to
864 * requeue.
865 */
866 if (likely(head1 != &hb2->chain)) {
867 plist_del(&this->list, &hb1->chain);
868 plist_add(&this->list, &hb2->chain);
869 this->lock_ptr = &hb2->lock;
870#ifdef CONFIG_DEBUG_PI_LIST
871 this->list.plist.lock = &hb2->lock;
872#endif
873 }
874 this->key = key2;
875 get_futex_key_refs(&key2);
876 drop_count++;
877 1243
878 if (ret - nr_wake >= nr_requeue) 1244 /*
879 break; 1245 * Requeue nr_requeue waiters and possibly one more in the case
1246 * of requeue_pi if we couldn't acquire the lock atomically.
1247 */
1248 if (requeue_pi) {
1249 /* Prepare the waiter to take the rt_mutex. */
1250 atomic_inc(&pi_state->refcount);
1251 this->pi_state = pi_state;
1252 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1253 this->rt_waiter,
1254 this->task, 1);
1255 if (ret == 1) {
1256 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2);
1258 continue;
1259 } else if (ret) {
1260 /* -EDEADLK */
1261 this->pi_state = NULL;
1262 free_pi_state(pi_state);
1263 goto out_unlock;
1264 }
880 } 1265 }
1266 requeue_futex(this, hb1, hb2, &key2);
1267 drop_count++;
881 } 1268 }
882 1269
883out_unlock: 1270out_unlock:
@@ -897,7 +1284,9 @@ out_put_keys:
897out_put_key1: 1284out_put_key1:
898 put_futex_key(fshared, &key1); 1285 put_futex_key(fshared, &key1);
899out: 1286out:
900 return ret; 1287 if (pi_state != NULL)
1288 free_pi_state(pi_state);
1289 return ret ? ret : task_count;
901} 1290}
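
The nr_wake == 1 restriction enforced above is aimed at condvar-style callers. A rough, hypothetical user-space sketch of how a broadcast might drive FUTEX_CMP_REQUEUE_PI (the wrapper name and the guarded opcode value are assumptions; a real implementation would also OR in FUTEX_PRIVATE_FLAG where appropriate):

#include <limits.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_CMP_REQUEUE_PI
#define FUTEX_CMP_REQUEUE_PI	12	/* may be missing from older headers */
#endif

/*
 * Hypothetical sketch: wake one waiter on cond_futex and requeue the rest
 * onto the PI futex word at mutex_futex. 'expected' is the value cond_futex
 * must still hold (the cmpval checked by futex_requeue()).
 */
static long cond_broadcast_requeue_pi(uint32_t *cond_futex,
				      uint32_t *mutex_futex, uint32_t expected)
{
	return syscall(SYS_futex, cond_futex, FUTEX_CMP_REQUEUE_PI,
		       1,				/* nr_wake: must be 1 */
		       (unsigned long)INT_MAX,		/* nr_requeue */
		       mutex_futex, expected);
}
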
902 1291
903/* The key must be already stored in q->key. */ 1292/* The key must be already stored in q->key. */
@@ -905,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
905{ 1294{
906 struct futex_hash_bucket *hb; 1295 struct futex_hash_bucket *hb;
907 1296
908 init_waitqueue_head(&q->waiter);
909
910 get_futex_key_refs(&q->key); 1297 get_futex_key_refs(&q->key);
911 hb = hash_futex(&q->key); 1298 hb = hash_futex(&q->key);
912 q->lock_ptr = &hb->lock; 1299 q->lock_ptr = &hb->lock;
@@ -1117,35 +1504,149 @@ handle_fault:
1117 */ 1504 */
1118#define FLAGS_SHARED 0x01 1505#define FLAGS_SHARED 0x01
1119#define FLAGS_CLOCKRT 0x02 1506#define FLAGS_CLOCKRT 0x02
1507#define FLAGS_HAS_TIMEOUT 0x04
1120 1508
1121static long futex_wait_restart(struct restart_block *restart); 1509static long futex_wait_restart(struct restart_block *restart);
1122 1510
1123static int futex_wait(u32 __user *uaddr, int fshared, 1511/**
1124 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1512 * fixup_owner() - Post lock pi_state and corner case management
1513 * @uaddr: user address of the futex
1514 * @fshared: whether the futex is shared (1) or not (0)
1515 * @q: futex_q (contains pi_state and access to the rt_mutex)
1516 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1517 *
1518 * After attempting to lock an rt_mutex, this function is called to cleanup
1519 * the pi_state owner as well as handle race conditions that may allow us to
1520 * acquire the lock. Must be called with the hb lock held.
1521 *
1522 * Returns:
1523 * 1 - success, lock taken
1524 * 0 - success, lock not taken
1525 * <0 - on error (-EFAULT)
1526 */
1527static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1528 int locked)
1125{ 1529{
1126 struct task_struct *curr = current; 1530 struct task_struct *owner;
1127 struct restart_block *restart; 1531 int ret = 0;
1128 DECLARE_WAITQUEUE(wait, curr);
1129 struct futex_hash_bucket *hb;
1130 struct futex_q q;
1131 u32 uval;
1132 int ret;
1133 struct hrtimer_sleeper t;
1134 int rem = 0;
1135 1532
1136 if (!bitset) 1533 if (locked) {
1137 return -EINVAL; 1534 /*
1535 * Got the lock. We might not be the anticipated owner if we
1536 * did a lock-steal - fix up the PI-state in that case:
1537 */
1538 if (q->pi_state->owner != current)
1539 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1540 goto out;
1541 }
1138 1542
1139 q.pi_state = NULL; 1543 /*
1140 q.bitset = bitset; 1544 * Catch the rare case, where the lock was released when we were on the
1141retry: 1545 * way back before we locked the hash bucket.
1142 q.key = FUTEX_KEY_INIT; 1546 */
1143 ret = get_futex_key(uaddr, fshared, &q.key); 1547 if (q->pi_state->owner == current) {
1144 if (unlikely(ret != 0)) 1548 /*
1549 * Try to get the rt_mutex now. This might fail as some other
1550 * task acquired the rt_mutex after we removed ourself from the
1551 * rt_mutex waiters list.
1552 */
1553 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1554 locked = 1;
1555 goto out;
1556 }
1557
1558 /*
1559 * pi_state is incorrect, some other task did a lock steal and
1560 * we returned due to timeout or signal without taking the
1561 * rt_mutex. Too late. We can access the rt_mutex_owner without
1562 * locking, as the other task is now blocked on the hash bucket
1563 * lock. Fix the state up.
1564 */
1565 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1566 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1145 goto out; 1567 goto out;
1568 }
1146 1569
1147retry_private: 1570 /*
1148 hb = queue_lock(&q); 1571 * Paranoia check. If we did not take the lock, then we should not be
1572 * the owner, nor the pending owner, of the rt_mutex.
1573 */
1574 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1575 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1576 "pi-state %p\n", ret,
1577 q->pi_state->pi_mutex.owner,
1578 q->pi_state->owner);
1579
1580out:
1581 return ret ? ret : locked;
1582}
1583
1584/**
1585 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1586 * @hb: the futex hash bucket, must be locked by the caller
1587 * @q: the futex_q to queue up on
1588 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1589 */
1590static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1591 struct hrtimer_sleeper *timeout)
1592{
1593 queue_me(q, hb);
1594
1595 /*
1596 * There might have been scheduling since the queue_me(), as we
1597 * cannot hold a spinlock across the get_user() in case it
1598 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1599 * queueing ourselves into the futex hash. This code thus has to
1600 * rely on the futex_wake() code removing us from hash when it
1601 * wakes us up.
1602 */
1603 set_current_state(TASK_INTERRUPTIBLE);
1604
1605 /* Arm the timer */
1606 if (timeout) {
1607 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1608 if (!hrtimer_active(&timeout->timer))
1609 timeout->task = NULL;
1610 }
1611
1612 /*
1613 * !plist_node_empty() is safe here without any lock.
1614 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1615 */
1616 if (likely(!plist_node_empty(&q->list))) {
1617 /*
1618 * If the timer has already expired, current will already be
1619 * flagged for rescheduling. Only call schedule if there
1620 * is no timeout, or if it has yet to expire.
1621 */
1622 if (!timeout || timeout->task)
1623 schedule();
1624 }
1625 __set_current_state(TASK_RUNNING);
1626}
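
The comment above relies on futex_wake() pulling the sleeper off the hash bucket; the matching user-space side is just the FUTEX_WAKE call, sketched here with a hypothetical wrapper:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/*
 * Hypothetical wrapper: wake up to 'n' tasks blocked in FUTEX_WAIT on uaddr.
 * Returns the number of tasks woken, or -1 with errno set.
 */
static long futex_wake_n(unsigned int *uaddr, int n)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE, n, NULL, NULL, 0);
}
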
1627
1628/**
1629 * futex_wait_setup() - Prepare to wait on a futex
1630 * @uaddr: the futex userspace address
1631 * @val: the expected value
1632 * @fshared: whether the futex is shared (1) or not (0)
1633 * @q: the associated futex_q
1634 * @hb: storage for hash_bucket pointer to be returned to caller
1635 *
1636 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1637 * compare it with the expected value. Handle atomic faults internally.
1638 * Return with the hb lock held and a q.key reference on success, and unlocked
1639 * with no q.key reference on failure.
1640 *
1641 * Returns:
1642 * 0 - uaddr contains val and hb has been locked
1643 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1644 */
1645static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1646 struct futex_q *q, struct futex_hash_bucket **hb)
1647{
1648 u32 uval;
1649 int ret;
1149 1650
1150 /* 1651 /*
1151 * Access the page AFTER the hash-bucket is locked. 1652 * Access the page AFTER the hash-bucket is locked.
@@ -1163,95 +1664,83 @@ retry_private:
1163 * A consequence is that futex_wait() can return zero and absorb 1664 * A consequence is that futex_wait() can return zero and absorb
1164 * a wakeup when *uaddr != val on entry to the syscall. This is 1665 * a wakeup when *uaddr != val on entry to the syscall. This is
1165 * rare, but normal. 1666 * rare, but normal.
1166 *
1167 * For shared futexes, we hold the mmap semaphore, so the mapping
1168 * cannot have changed since we looked it up in get_futex_key.
1169 */ 1667 */
1668retry:
1669 q->key = FUTEX_KEY_INIT;
1670 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1671 if (unlikely(ret != 0))
1672 return ret;
1673
1674retry_private:
1675 *hb = queue_lock(q);
1676
1170 ret = get_futex_value_locked(&uval, uaddr); 1677 ret = get_futex_value_locked(&uval, uaddr);
1171 1678
1172 if (unlikely(ret)) { 1679 if (ret) {
1173 queue_unlock(&q, hb); 1680 queue_unlock(q, *hb);
1174 1681
1175 ret = get_user(uval, uaddr); 1682 ret = get_user(uval, uaddr);
1176 if (ret) 1683 if (ret)
1177 goto out_put_key; 1684 goto out;
1178 1685
1179 if (!fshared) 1686 if (!fshared)
1180 goto retry_private; 1687 goto retry_private;
1181 1688
1182 put_futex_key(fshared, &q.key); 1689 put_futex_key(fshared, &q->key);
1183 goto retry; 1690 goto retry;
1184 } 1691 }
1185 ret = -EWOULDBLOCK;
1186 if (unlikely(uval != val)) {
1187 queue_unlock(&q, hb);
1188 goto out_put_key;
1189 }
1190 1692
1191 /* Only actually queue if *uaddr contained val. */ 1693 if (uval != val) {
1192 queue_me(&q, hb); 1694 queue_unlock(q, *hb);
1695 ret = -EWOULDBLOCK;
1696 }
1193 1697
1194 /* 1698out:
1195 * There might have been scheduling since the queue_me(), as we 1699 if (ret)
1196 * cannot hold a spinlock across the get_user() in case it 1700 put_futex_key(fshared, &q->key);
1197 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1701 return ret;
1198 * queueing ourselves into the futex hash. This code thus has to 1702}
1199 * rely on the futex_wake() code removing us from hash when it
1200 * wakes us up.
1201 */
1202 1703
1203 /* add_wait_queue is the barrier after __set_current_state. */ 1704static int futex_wait(u32 __user *uaddr, int fshared,
1204 __set_current_state(TASK_INTERRUPTIBLE); 1705 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1205 add_wait_queue(&q.waiter, &wait); 1706{
1206 /* 1707 struct hrtimer_sleeper timeout, *to = NULL;
1207 * !plist_node_empty() is safe here without any lock. 1708 struct restart_block *restart;
1208 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1709 struct futex_hash_bucket *hb;
1209 */ 1710 struct futex_q q;
1210 if (likely(!plist_node_empty(&q.list))) { 1711 int ret;
1211 if (!abs_time)
1212 schedule();
1213 else {
1214 hrtimer_init_on_stack(&t.timer,
1215 clockrt ? CLOCK_REALTIME :
1216 CLOCK_MONOTONIC,
1217 HRTIMER_MODE_ABS);
1218 hrtimer_init_sleeper(&t, current);
1219 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1220 current->timer_slack_ns);
1221
1222 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1223 if (!hrtimer_active(&t.timer))
1224 t.task = NULL;
1225 1712
1226 /* 1713 if (!bitset)
1227 * the timer could have already expired, in which 1714 return -EINVAL;
1228 * case current would be flagged for rescheduling.
1229 * Don't bother calling schedule.
1230 */
1231 if (likely(t.task))
1232 schedule();
1233 1715
1234 hrtimer_cancel(&t.timer); 1716 q.pi_state = NULL;
1717 q.bitset = bitset;
1718 q.rt_waiter = NULL;
1235 1719
1236 /* Flag if a timeout occured */ 1720 if (abs_time) {
1237 rem = (t.task == NULL); 1721 to = &timeout;
1238 1722
1239 destroy_hrtimer_on_stack(&t.timer); 1723 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1240 } 1724 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1725 hrtimer_init_sleeper(to, current);
1726 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1727 current->timer_slack_ns);
1241 } 1728 }
1242 __set_current_state(TASK_RUNNING);
1243 1729
1244 /* 1730 /* Prepare to wait on uaddr. */
1245 * NOTE: we don't remove ourselves from the waitqueue because 1731 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1246 * we are the only user of it. 1732 if (ret)
1247 */ 1733 goto out;
1734
1735 /* queue_me and wait for wakeup, timeout, or a signal. */
1736 futex_wait_queue_me(hb, &q, to);
1248 1737
1249 /* If we were woken (and unqueued), we succeeded, whatever. */ 1738 /* If we were woken (and unqueued), we succeeded, whatever. */
1250 ret = 0; 1739 ret = 0;
1251 if (!unqueue_me(&q)) 1740 if (!unqueue_me(&q))
1252 goto out_put_key; 1741 goto out_put_key;
1253 ret = -ETIMEDOUT; 1742 ret = -ETIMEDOUT;
1254 if (rem) 1743 if (to && !to->task)
1255 goto out_put_key; 1744 goto out_put_key;
1256 1745
1257 /* 1746 /*
@@ -1268,7 +1757,7 @@ retry_private:
1268 restart->futex.val = val; 1757 restart->futex.val = val;
1269 restart->futex.time = abs_time->tv64; 1758 restart->futex.time = abs_time->tv64;
1270 restart->futex.bitset = bitset; 1759 restart->futex.bitset = bitset;
1271 restart->futex.flags = 0; 1760 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1272 1761
1273 if (fshared) 1762 if (fshared)
1274 restart->futex.flags |= FLAGS_SHARED; 1763 restart->futex.flags |= FLAGS_SHARED;
@@ -1280,6 +1769,10 @@ retry_private:
1280out_put_key: 1769out_put_key:
1281 put_futex_key(fshared, &q.key); 1770 put_futex_key(fshared, &q.key);
1282out: 1771out:
1772 if (to) {
1773 hrtimer_cancel(&to->timer);
1774 destroy_hrtimer_on_stack(&to->timer);
1775 }
1283 return ret; 1776 return ret;
1284} 1777}
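
From user space, the path above is reached through FUTEX_WAIT with a relative timeout; the -EWOULDBLOCK case corresponds to the value check in futex_wait_setup(). A minimal, hypothetical wrapper:

#include <errno.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/*
 * Hypothetical wrapper: sleep until woken, interrupted, or the relative
 * timeout expires, but only if *uaddr still equals 'expected' when the
 * kernel checks it (otherwise -EWOULDBLOCK). A 0 return may still be a
 * spurious wakeup, matching the comment in futex_wait_setup() above.
 */
static int futex_wait_rel(uint32_t *uaddr, uint32_t expected,
			  const struct timespec *rel_timeout)
{
	if (syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
		    rel_timeout, NULL, 0) == 0)
		return 0;
	return -errno;		/* -EWOULDBLOCK, -ETIMEDOUT, -EINTR, ... */
}
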
1285 1778
@@ -1288,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
1288{ 1781{
1289 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1782 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1290 int fshared = 0; 1783 int fshared = 0;
1291 ktime_t t; 1784 ktime_t t, *tp = NULL;
1292 1785
1293 t.tv64 = restart->futex.time; 1786 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1787 t.tv64 = restart->futex.time;
1788 tp = &t;
1789 }
1294 restart->fn = do_no_restart_syscall; 1790 restart->fn = do_no_restart_syscall;
1295 if (restart->futex.flags & FLAGS_SHARED) 1791 if (restart->futex.flags & FLAGS_SHARED)
1296 fshared = 1; 1792 fshared = 1;
1297 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1793 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1298 restart->futex.bitset, 1794 restart->futex.bitset,
1299 restart->futex.flags & FLAGS_CLOCKRT); 1795 restart->futex.flags & FLAGS_CLOCKRT);
1300} 1796}
@@ -1310,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1310 int detect, ktime_t *time, int trylock) 1806 int detect, ktime_t *time, int trylock)
1311{ 1807{
1312 struct hrtimer_sleeper timeout, *to = NULL; 1808 struct hrtimer_sleeper timeout, *to = NULL;
1313 struct task_struct *curr = current;
1314 struct futex_hash_bucket *hb; 1809 struct futex_hash_bucket *hb;
1315 u32 uval, newval, curval; 1810 u32 uval;
1316 struct futex_q q; 1811 struct futex_q q;
1317 int ret, lock_taken, ownerdied = 0; 1812 int res, ret;
1318 1813
1319 if (refill_pi_state_cache()) 1814 if (refill_pi_state_cache())
1320 return -ENOMEM; 1815 return -ENOMEM;
@@ -1328,90 +1823,25 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1328 } 1823 }
1329 1824
1330 q.pi_state = NULL; 1825 q.pi_state = NULL;
1826 q.rt_waiter = NULL;
1331retry: 1827retry:
1332 q.key = FUTEX_KEY_INIT; 1828 q.key = FUTEX_KEY_INIT;
1333 ret = get_futex_key(uaddr, fshared, &q.key); 1829 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
1334 if (unlikely(ret != 0)) 1830 if (unlikely(ret != 0))
1335 goto out; 1831 goto out;
1336 1832
1337retry_private: 1833retry_private:
1338 hb = queue_lock(&q); 1834 hb = queue_lock(&q);
1339 1835
1340retry_locked: 1836 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1341 ret = lock_taken = 0;
1342
1343 /*
1344 * To avoid races, we attempt to take the lock here again
1345 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1346 * the locks. It will most likely not succeed.
1347 */
1348 newval = task_pid_vnr(current);
1349
1350 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1351
1352 if (unlikely(curval == -EFAULT))
1353 goto uaddr_faulted;
1354
1355 /*
1356 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1357 * situation and we return success to user space.
1358 */
1359 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1360 ret = -EDEADLK;
1361 goto out_unlock_put_key;
1362 }
1363
1364 /*
1365 * Surprise - we got the lock. Just return to userspace:
1366 */
1367 if (unlikely(!curval))
1368 goto out_unlock_put_key;
1369
1370 uval = curval;
1371
1372 /*
1373 * Set the WAITERS flag, so the owner will know it has someone
1374 * to wake at next unlock
1375 */
1376 newval = curval | FUTEX_WAITERS;
1377
1378 /*
1379 * There are two cases, where a futex might have no owner (the
1380 * owner TID is 0): OWNER_DIED. We take over the futex in this
1381 * case. We also do an unconditional take over, when the owner
1382 * of the futex died.
1383 *
1384 * This is safe as we are protected by the hash bucket lock !
1385 */
1386 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1387 /* Keep the OWNER_DIED bit */
1388 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1389 ownerdied = 0;
1390 lock_taken = 1;
1391 }
1392
1393 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1394
1395 if (unlikely(curval == -EFAULT))
1396 goto uaddr_faulted;
1397 if (unlikely(curval != uval))
1398 goto retry_locked;
1399
1400 /*
1401 * We took the lock due to owner died take over.
1402 */
1403 if (unlikely(lock_taken))
1404 goto out_unlock_put_key;
1405
1406 /*
1407 * We dont have the lock. Look up the PI state (or create it if
1408 * we are the first waiter):
1409 */
1410 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1411
1412 if (unlikely(ret)) { 1837 if (unlikely(ret)) {
1413 switch (ret) { 1838 switch (ret) {
1414 1839 case 1:
1840 /* We got the lock. */
1841 ret = 0;
1842 goto out_unlock_put_key;
1843 case -EFAULT:
1844 goto uaddr_faulted;
1415 case -EAGAIN: 1845 case -EAGAIN:
1416 /* 1846 /*
1417 * Task is exiting and we just wait for the 1847 * Task is exiting and we just wait for the
@@ -1421,25 +1851,6 @@ retry_locked:
1421 put_futex_key(fshared, &q.key); 1851 put_futex_key(fshared, &q.key);
1422 cond_resched(); 1852 cond_resched();
1423 goto retry; 1853 goto retry;
1424
1425 case -ESRCH:
1426 /*
1427 * No owner found for this futex. Check if the
1428 * OWNER_DIED bit is set to figure out whether
1429 * this is a robust futex or not.
1430 */
1431 if (get_futex_value_locked(&curval, uaddr))
1432 goto uaddr_faulted;
1433
1434 /*
1435 * We simply start over in case of a robust
1436 * futex. The code above will take the futex
1437 * and return happy.
1438 */
1439 if (curval & FUTEX_OWNER_DIED) {
1440 ownerdied = 1;
1441 goto retry_locked;
1442 }
1443 default: 1854 default:
1444 goto out_unlock_put_key; 1855 goto out_unlock_put_key;
1445 } 1856 }
@@ -1463,71 +1874,21 @@ retry_locked:
1463 } 1874 }
1464 1875
1465 spin_lock(q.lock_ptr); 1876 spin_lock(q.lock_ptr);
1466 1877 /*
1467 if (!ret) { 1878 * Fixup the pi_state owner and possibly acquire the lock if we
1468 /* 1879 * haven't already.
1469 * Got the lock. We might not be the anticipated owner 1880 */
1470 * if we did a lock-steal - fix up the PI-state in 1881 res = fixup_owner(uaddr, fshared, &q, !ret);
1471 * that case: 1882 /*
1472 */ 1883 * If fixup_owner() returned an error, propagate that. If it acquired
1473 if (q.pi_state->owner != curr) 1884 * the lock, clear our -ETIMEDOUT or -EINTR.
1474 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1885 */
1475 } else { 1886 if (res)
1476 /* 1887 ret = (res < 0) ? res : 0;
1477 * Catch the rare case, where the lock was released
1478 * when we were on the way back before we locked the
1479 * hash bucket.
1480 */
1481 if (q.pi_state->owner == curr) {
1482 /*
1483 * Try to get the rt_mutex now. This might
1484 * fail as some other task acquired the
1485 * rt_mutex after we removed ourself from the
1486 * rt_mutex waiters list.
1487 */
1488 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1489 ret = 0;
1490 else {
1491 /*
1492 * pi_state is incorrect, some other
1493 * task did a lock steal and we
1494 * returned due to timeout or signal
1495 * without taking the rt_mutex. Too
1496 * late. We can access the
1497 * rt_mutex_owner without locking, as
1498 * the other task is now blocked on
1499 * the hash bucket lock. Fix the state
1500 * up.
1501 */
1502 struct task_struct *owner;
1503 int res;
1504
1505 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1506 res = fixup_pi_state_owner(uaddr, &q, owner,
1507 fshared);
1508
1509 /* propagate -EFAULT, if the fixup failed */
1510 if (res)
1511 ret = res;
1512 }
1513 } else {
1514 /*
1515 * Paranoia check. If we did not take the lock
1516 * in the trylock above, then we should not be
1517 * the owner of the rtmutex, neither the real
1518 * nor the pending one:
1519 */
1520 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1521 printk(KERN_ERR "futex_lock_pi: ret = %d "
1522 "pi-mutex: %p pi-state %p\n", ret,
1523 q.pi_state->pi_mutex.owner,
1524 q.pi_state->owner);
1525 }
1526 }
1527 1888
1528 /* 1889 /*
1529 * If fixup_pi_state_owner() faulted and was unable to handle the 1890 * If fixup_owner() faulted and was unable to handle the fault, unlock
1530 * fault, unlock it and return the fault to userspace. 1891 * it and return the fault to userspace.
1531 */ 1892 */
1532 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1893 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1533 rt_mutex_unlock(&q.pi_state->pi_mutex); 1894 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1535,9 +1896,7 @@ retry_locked:
1535 /* Unqueue and drop the lock */ 1896 /* Unqueue and drop the lock */
1536 unqueue_me_pi(&q); 1897 unqueue_me_pi(&q);
1537 1898
1538 if (to) 1899 goto out;
1539 destroy_hrtimer_on_stack(&to->timer);
1540 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1541 1900
1542out_unlock_put_key: 1901out_unlock_put_key:
1543 queue_unlock(&q, hb); 1902 queue_unlock(&q, hb);
@@ -1547,7 +1906,7 @@ out_put_key:
1547out: 1906out:
1548 if (to) 1907 if (to)
1549 destroy_hrtimer_on_stack(&to->timer); 1908 destroy_hrtimer_on_stack(&to->timer);
1550 return ret; 1909 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1551 1910
1552uaddr_faulted: 1911uaddr_faulted:
1553 /* 1912 /*
@@ -1570,7 +1929,6 @@ uaddr_faulted:
1570 goto retry; 1929 goto retry;
1571} 1930}
1572 1931
1573
1574/* 1932/*
1575 * Userspace attempted a TID -> 0 atomic transition, and failed. 1933 * Userspace attempted a TID -> 0 atomic transition, and failed.
1576 * This is the in-kernel slowpath: we look up the PI state (if any), 1934 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1594,7 +1952,7 @@ retry:
1594 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1952 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1595 return -EPERM; 1953 return -EPERM;
1596 1954
1597 ret = get_futex_key(uaddr, fshared, &key); 1955 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
1598 if (unlikely(ret != 0)) 1956 if (unlikely(ret != 0))
1599 goto out; 1957 goto out;
1600 1958
@@ -1672,6 +2030,229 @@ pi_faulted:
1672 return ret; 2030 return ret;
1673} 2031}
1674 2032
2033/**
2034 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2035 * @hb: the hash_bucket futex_q was originally enqueued on
2036 * @q: the futex_q woken while waiting to be requeued
2037 * @key2: the futex_key of the requeue target futex
2038 * @timeout: the timeout associated with the wait (NULL if none)
2039 *
2040 * Detect if the task was woken on the initial futex as opposed to the requeue
2041 * target futex. If so, determine if it was a timeout or a signal that caused
2042 * the wakeup and return the appropriate error code to the caller. Must be
2043 * called with the hb lock held.
2044 *
2045 * Returns
2046 * 0 - no early wakeup detected
2047 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2048 */
2049static inline
2050int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2051 struct futex_q *q, union futex_key *key2,
2052 struct hrtimer_sleeper *timeout)
2053{
2054 int ret = 0;
2055
2056 /*
2057 * With the hb lock held, we avoid races while we process the wakeup.
2058 * We only need to hold hb (and not hb2) to ensure atomicity as the
2059 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2060 * It can't be requeued from uaddr2 to something else since we don't
2061 * support a PI aware source futex for requeue.
2062 */
2063 if (!match_futex(&q->key, key2)) {
2064 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2065 /*
2066 * We were woken prior to requeue by a timeout or a signal.
2067 * Unqueue the futex_q and determine which it was.
2068 */
2069 plist_del(&q->list, &q->list.plist);
2070 drop_futex_key_refs(&q->key);
2071
2072 if (timeout && !timeout->task)
2073 ret = -ETIMEDOUT;
2074 else
2075 ret = -ERESTARTNOINTR;
2076 }
2077 return ret;
2078}
2079
2080/**
2081 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2082 * @uaddr: the futex we initially wait on (non-pi)
2083 * @fshared: whether the futexes are shared (1) or not (0). They must be
2084 * the same type, no requeueing from private to shared, etc.
2085 * @val: the expected value of uaddr
2086 * @abs_time: absolute timeout
2087 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2088 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2089 * @uaddr2: the pi futex we will take prior to returning to user-space
2090 *
2091 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2092 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2093 * complete the acquisition of the rt_mutex prior to returning to userspace.
2094 * This ensures the rt_mutex maintains an owner when it has waiters; without
2095 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2096 * need to.
2097 *
2098 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2099 * via the following:
2100 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2101 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2102 * 3) signal (before or after requeue)
2103 * 4) timeout (before or after requeue)
2104 *
2105 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2106 *
2107 * If 2, we may then block on trying to take the rt_mutex and return via:
2108 * 5) successful lock
2109 * 6) signal
2110 * 7) timeout
2111 * 8) other lock acquisition failure
2112 *
2113 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2114 *
2115 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2116 *
2117 * Returns:
2118 * 0 - On success
2119 * <0 - On error
2120 */
2121static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2122 u32 val, ktime_t *abs_time, u32 bitset,
2123 int clockrt, u32 __user *uaddr2)
2124{
2125 struct hrtimer_sleeper timeout, *to = NULL;
2126 struct rt_mutex_waiter rt_waiter;
2127 struct rt_mutex *pi_mutex = NULL;
2128 struct futex_hash_bucket *hb;
2129 union futex_key key2;
2130 struct futex_q q;
2131 int res, ret;
2132
2133 if (!bitset)
2134 return -EINVAL;
2135
2136 if (abs_time) {
2137 to = &timeout;
2138 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2139 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2140 hrtimer_init_sleeper(to, current);
2141 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2142 current->timer_slack_ns);
2143 }
2144
2145 /*
2146 * The waiter is allocated on our stack, manipulated by the requeue
2147 * code while we sleep on uaddr.
2148 */
2149 debug_rt_mutex_init_waiter(&rt_waiter);
2150 rt_waiter.task = NULL;
2151
2152 q.pi_state = NULL;
2153 q.bitset = bitset;
2154 q.rt_waiter = &rt_waiter;
2155
2156 key2 = FUTEX_KEY_INIT;
2157 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2158 if (unlikely(ret != 0))
2159 goto out;
2160
2161 /* Prepare to wait on uaddr. */
2162 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2163 if (ret)
2164 goto out_key2;
2165
2166 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2167 futex_wait_queue_me(hb, &q, to);
2168
2169 spin_lock(&hb->lock);
2170 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2171 spin_unlock(&hb->lock);
2172 if (ret)
2173 goto out_put_keys;
2174
2175 /*
2176 * In order for us to be here, we know our q.key == key2, and since
2177 * we took the hb->lock above, we also know that futex_requeue() has
2178 * completed and we no longer have to concern ourselves with a wakeup
2179 * race with the atomic proxy lock acquisition by the requeue code.
2180 */
2181
2182 /* Check if the requeue code acquired the second futex for us. */
2183 if (!q.rt_waiter) {
2184 /*
2185 * Got the lock. We might not be the anticipated owner if we
2186 * did a lock-steal - fix up the PI-state in that case.
2187 */
2188 if (q.pi_state && (q.pi_state->owner != current)) {
2189 spin_lock(q.lock_ptr);
2190 ret = fixup_pi_state_owner(uaddr2, &q, current,
2191 fshared);
2192 spin_unlock(q.lock_ptr);
2193 }
2194 } else {
2195 /*
2196 * We have been woken up by futex_unlock_pi(), a timeout, or a
2197 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2198 * the pi_state.
2199 */
2200 WARN_ON(!q.pi_state);
2201 pi_mutex = &q.pi_state->pi_mutex;
2202 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2203 debug_rt_mutex_free_waiter(&rt_waiter);
2204
2205 spin_lock(q.lock_ptr);
2206 /*
2207 * Fixup the pi_state owner and possibly acquire the lock if we
2208 * haven't already.
2209 */
2210 res = fixup_owner(uaddr2, fshared, &q, !ret);
2211 /*
2212 * If fixup_owner() returned an error, propagate that. If it
2213 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2214 */
2215 if (res)
2216 ret = (res < 0) ? res : 0;
2217
2218 /* Unqueue and drop the lock. */
2219 unqueue_me_pi(&q);
2220 }
2221
2222 /*
2223 * If fixup_pi_state_owner() faulted and was unable to handle the
2224 * fault, unlock the rt_mutex and return the fault to userspace.
2225 */
2226 if (ret == -EFAULT) {
2227 if (rt_mutex_owner(pi_mutex) == current)
2228 rt_mutex_unlock(pi_mutex);
2229 } else if (ret == -EINTR) {
2230 /*
2231 * We've already been requeued, but we have no way to
2232 * restart by calling futex_lock_pi() directly. We
2233 * could restart the syscall, but that will look at
2234 * the user space value and return right away. So we
2235 * drop back with EWOULDBLOCK to tell user space that
2236 * "val" has been changed. That's the same as what the
2237 * restart of the syscall would do in
2238 * futex_wait_setup().
2239 */
2240 ret = -EWOULDBLOCK;
2241 }
2242
2243out_put_keys:
2244 put_futex_key(fshared, &q.key);
2245out_key2:
2246 put_futex_key(fshared, &key2);
2247
2248out:
2249 if (to) {
2250 hrtimer_cancel(&to->timer);
2251 destroy_hrtimer_on_stack(&to->timer);
2252 }
2253 return ret;
2254}
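
For completeness, the waiter side of the handshake described in the kernel-doc above would be entered roughly as follows (hypothetical wrapper; the guarded opcode value is an assumption for older headers, the timeout is absolute, and val3 is ignored because do_futex() forces FUTEX_BITSET_MATCH_ANY for this command):

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_WAIT_REQUEUE_PI
#define FUTEX_WAIT_REQUEUE_PI	11	/* may be missing from older headers */
#endif

/*
 * Hypothetical sketch: wait on cond_futex (expected value 'seq') and let the
 * kernel complete acquisition of the PI futex at mutex_futex before
 * returning. abs_timeout, if non-NULL, is an absolute CLOCK_MONOTONIC time
 * unless FUTEX_CLOCK_REALTIME is ORed into the operation.
 */
static long cond_wait_requeue_pi(uint32_t *cond_futex, uint32_t seq,
				 uint32_t *mutex_futex,
				 const struct timespec *abs_timeout)
{
	return syscall(SYS_futex, cond_futex, FUTEX_WAIT_REQUEUE_PI, seq,
		       abs_timeout, mutex_futex, 0);
}
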
2255
1675/* 2256/*
1676 * Support for robust futexes: the kernel cleans up held futexes at 2257 * Support for robust futexes: the kernel cleans up held futexes at
1677 * thread exit time. 2258 * thread exit time.
@@ -1894,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1894 fshared = 1; 2475 fshared = 1;
1895 2476
1896 clockrt = op & FUTEX_CLOCK_REALTIME; 2477 clockrt = op & FUTEX_CLOCK_REALTIME;
1897 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2478 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1898 return -ENOSYS; 2479 return -ENOSYS;
1899 2480
1900 switch (cmd) { 2481 switch (cmd) {
@@ -1909,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1909 ret = futex_wake(uaddr, fshared, val, val3); 2490 ret = futex_wake(uaddr, fshared, val, val3);
1910 break; 2491 break;
1911 case FUTEX_REQUEUE: 2492 case FUTEX_REQUEUE:
1912 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2493 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1913 break; 2494 break;
1914 case FUTEX_CMP_REQUEUE: 2495 case FUTEX_CMP_REQUEUE:
1915 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2496 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2497 0);
1916 break; 2498 break;
1917 case FUTEX_WAKE_OP: 2499 case FUTEX_WAKE_OP:
1918 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2500 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1929,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1929 if (futex_cmpxchg_enabled) 2511 if (futex_cmpxchg_enabled)
1930 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2512 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1931 break; 2513 break;
2514 case FUTEX_WAIT_REQUEUE_PI:
2515 val3 = FUTEX_BITSET_MATCH_ANY;
2516 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2517 clockrt, uaddr2);
2518 break;
2519 case FUTEX_CMP_REQUEUE_PI:
2520 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2521 1);
2522 break;
1932 default: 2523 default:
1933 ret = -ENOSYS; 2524 ret = -ENOSYS;
1934 } 2525 }
@@ -1946,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1946 int cmd = op & FUTEX_CMD_MASK; 2537 int cmd = op & FUTEX_CMD_MASK;
1947 2538
1948 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2539 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1949 cmd == FUTEX_WAIT_BITSET)) { 2540 cmd == FUTEX_WAIT_BITSET ||
2541 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1950 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2542 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1951 return -EFAULT; 2543 return -EFAULT;
1952 if (!timespec_valid(&ts)) 2544 if (!timespec_valid(&ts))
@@ -1958,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1958 tp = &t; 2550 tp = &t;
1959 } 2551 }
1960 /* 2552 /*
1961 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2553 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1962 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2554 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1963 */ 2555 */
1964 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2556 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1965 cmd == FUTEX_WAKE_OP) 2557 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1966 val2 = (u32) (unsigned long) utime; 2558 val2 = (u32) (unsigned long) utime;
1967 2559
1968 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2560 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
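
Since the hunk above leaves the timeout absolute for everything except plain FUTEX_WAIT, a caller of FUTEX_WAIT_BITSET or FUTEX_WAIT_REQUEUE_PI has to build the deadline itself; a small sketch (the helper name is an assumption):

#include <time.h>

/*
 * Hypothetical helper: absolute CLOCK_MONOTONIC deadline 'ms' milliseconds
 * from now, suitable for FUTEX_WAIT_BITSET / FUTEX_WAIT_REQUEUE_PI when
 * FUTEX_CLOCK_REALTIME is not used.
 */
static struct timespec futex_abs_deadline_ms(long ms)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	ts.tv_sec += ms / 1000;
	ts.tv_nsec += (ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}
	return ts;
}
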
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
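
The CAP_SETGID check above is why the classic privilege-drop sequence calls setgroups(2) before setgid()/setuid(); a minimal user-space sketch:

#include <grp.h>
#include <unistd.h>
#include <sys/types.h>

/*
 * Sketch of the usual privilege-drop ordering: shrink the supplementary
 * group list while we still have CAP_SETGID, then drop gid and uid.
 */
static int drop_privileges(uid_t uid, gid_t gid)
{
	if (setgroups(1, &gid) < 0)
		return -1;
	if (setgid(gid) < 0)
		return -1;
	if (setuid(uid) < 0)
		return -1;
	return 0;
}
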
262
263/*
264 * Check whether we're fsgid/egid or in the supplemental groups.
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
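
Conversely, the getgroups path above can be exercised with a short program; passing a size of 0 first returns the count, matching the gidsetsize handling in the syscall:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>

int main(void)
{
	int i, n = getgroups(0, NULL);	/* size 0: just report the count */
	gid_t *list;

	if (n < 0) {
		perror("getgroups");
		return 1;
	}
	list = calloc(n ? n : 1, sizeof(*list));
	if (!list)
		return 1;
	if (getgroups(n, list) < 0) {
		perror("getgroups");
		free(list);
		return 1;
	}
	for (i = 0; i < n; i++)
		printf("gid %u\n", (unsigned int)list[i]);
	free(list);
	return 0;
}
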
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..b675a67c9ac3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/timer.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
@@ -193,12 +195,24 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
193 * Switch the timer base to the current CPU when possible. 195 * Switch the timer base to the current CPU when possible.
194 */ 196 */
195static inline struct hrtimer_clock_base * 197static inline struct hrtimer_clock_base *
196switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) 198switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
199 int pinned)
197{ 200{
198 struct hrtimer_clock_base *new_base; 201 struct hrtimer_clock_base *new_base;
199 struct hrtimer_cpu_base *new_cpu_base; 202 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1;
204
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
200 213
201 new_cpu_base = &__get_cpu_var(hrtimer_bases); 214again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
202 new_base = &new_cpu_base->clock_base[base->index]; 216 new_base = &new_cpu_base->clock_base[base->index];
203 217
204 if (base != new_base) { 218 if (base != new_base) {
@@ -218,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
218 timer->base = NULL; 232 timer->base = NULL;
219 spin_unlock(&base->cpu_base->lock); 233 spin_unlock(&base->cpu_base->lock);
220 spin_lock(&new_base->cpu_base->lock); 234 spin_lock(&new_base->cpu_base->lock);
235
236 /* Optimized away for NOHZ=n SMP=n */
237 if (cpu == preferred_cpu) {
238 /* Calculate clock monotonic expiry time */
239#ifdef CONFIG_HIGH_RES_TIMERS
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
241 new_base->offset);
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 }
221 timer->base = new_base; 269 timer->base = new_base;
222 } 270 }
223 return new_base; 271 return new_base;
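
The new pinned argument is driven by HRTIMER_MODE_PINNED, which callers OR into the start mode. A minimal kernel-side sketch, assuming the HRTIMER_MODE_REL_PINNED alias introduced alongside this change:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/*
 * Sketch: arm 'timer' to fire in 10ms and keep it on the current CPU even
 * when the CPU is idle and timer migration is enabled.
 */
static void arm_pinned_timer(struct hrtimer *timer)
{
	hrtimer_start(timer, ktime_set(0, 10 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL_PINNED);
}
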
@@ -235,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
235 return base; 283 return base;
236} 284}
237 285
238# define switch_hrtimer_base(t, b) (b) 286# define switch_hrtimer_base(t, b, p) (b)
239 287
240#endif /* !CONFIG_SMP */ 288#endif /* !CONFIG_SMP */
241 289
@@ -907,9 +955,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
907 ret = remove_hrtimer(timer, base); 955 ret = remove_hrtimer(timer, base);
908 956
909 /* Switch the timer base, if necessary: */ 957 /* Switch the timer base, if necessary: */
910 new_base = switch_hrtimer_base(timer, base); 958 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
911 959
912 if (mode == HRTIMER_MODE_REL) { 960 if (mode & HRTIMER_MODE_REL) {
913 tim = ktime_add_safe(tim, new_base->get_time()); 961 tim = ktime_add_safe(tim, new_base->get_time());
914 /* 962 /*
915 * CONFIG_TIME_LOW_RES is a temporary way for architectures 963 * CONFIG_TIME_LOW_RES is a temporary way for architectures
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5dd2572993cf..065205bdd920 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
@@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
44#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 45#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
45static void __init init_irq_default_affinity(void) 46static void __init init_irq_default_affinity(void)
46{ 47{
47 alloc_bootmem_cpumask_var(&irq_default_affinity); 48 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
48 cpumask_setall(irq_default_affinity); 49 cpumask_setall(irq_default_affinity);
49} 50}
50#else 51#else
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
93 * don't overwrite if we cannot get a new one 97 * don't overwrite if we cannot get a new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
146{ 150{
147 struct irq_desc *desc; 151 struct irq_desc *desc;
148 int legacy_count; 152 int legacy_count;
153 int node;
149 int i; 154 int i;
150 155
151 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
156 161
157 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
158 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
159 165
160 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
162 168
163 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
166 sizeof(int));
167 172
168 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
169 desc[i].irq = i; 174 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
178 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
174 } 180 }
175 181
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 193 return NULL;
188} 194}
189 195
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 196struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 197{
192 struct irq_desc *desc; 198 struct irq_desc *desc;
193 unsigned long flags; 199 unsigned long flags;
194 int node;
195 200
196 if (irq >= nr_irqs) { 201 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 202 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 215 if (desc)
211 goto out_unlock; 216 goto out_unlock;
212 217
213 node = cpu_to_node(cpu); 218 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 219 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 220 else
216 irq, cpu, node); 221 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
222
223 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 224 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 225 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 226 BUG_ON(1);
220 } 227 }
221 init_one_irq_desc(irq, desc, cpu); 228 init_one_irq_desc(irq, desc, node);
222 229
223 irq_desc_ptrs[irq] = desc; 230 irq_desc_ptrs[irq] = desc;
224 231
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
256 263
257 for (i = 0; i < count; i++) { 264 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 265 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 266 alloc_desc_masks(&desc[i], 0, true);
267 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 268 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 269 }
262 return arch_early_irq_init(); 270 return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 276}
269 277
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 278struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 279{
272 return irq_to_desc(irq); 280 return irq_to_desc(irq);
273} 281}
@@ -450,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
450 /* 458 /*
451 * No locking required for CPU-local interrupts: 459 * No locking required for CPU-local interrupts:
452 */ 460 */
453 if (desc->chip->ack) { 461 if (desc->chip->ack)
454 desc->chip->ack(irq); 462 desc->chip->ack(irq);
455 /* get new one */
456 desc = irq_remap_to_desc(irq, desc);
457 }
458 if (likely(!(desc->status & IRQ_DISABLED))) { 463 if (likely(!(desc->status & IRQ_DISABLED))) {
459 action_ret = handle_IRQ_event(irq, desc->action); 464 action_ret = handle_IRQ_event(irq, desc->action);
460 if (!noirqdebug) 465 if (!noirqdebug)
@@ -465,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
465 } 470 }
466 471
467 spin_lock(&desc->lock); 472 spin_lock(&desc->lock);
468 if (desc->chip->ack) { 473 if (desc->chip->ack)
469 desc->chip->ack(irq); 474 desc->chip->ack(irq);
470 desc = irq_remap_to_desc(irq, desc);
471 }
472 /* 475 /*
473 * REPLAY is when Linux resends an IRQ that was dropped earlier 476 * REPLAY is when Linux resends an IRQ that was dropped earlier
474 * WAITING is used by probe to mark irqs that are being tested 477 * WAITING is used by probe to mark irqs that are being tested
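The handle.c hunks above make irq_desc/kstat_irqs allocation work both before and after the slab allocator comes up: kzalloc_node() once slab_is_available(), alloc_bootmem_node() otherwise. A minimal sketch of that fallback, using the same kernel APIs as the hunk itself; the helper name alloc_on_node() is illustrative only, not part of the patch.

#include <linux/bootmem.h>
#include <linux/slab.h>

/*
 * Hypothetical helper mirroring the init_kstat_irqs() fallback above:
 * use slab once it is available, the boot-time allocator before that.
 */
static void *alloc_on_node(int node, size_t size)
{
	if (slab_is_available())
		return kzalloc_node(size, GFP_ATOMIC, node);

	return alloc_bootmem_node(NODE_DATA(node), size);
}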
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..73468253143b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47
45/* 48/*
46 * Debugging printout: 49 * Debugging printout:
47 */ 50 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..aaf5c9d05770 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{ 85{
86 struct irqaction *action = desc->action; 86 struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 109 spin_lock_irqsave(&desc->lock, flags);
110 110
111#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 112 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 113 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask);
116 }
117 }
114 else { 118 else {
115 desc->status |= IRQ_MOVE_PENDING; 119 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 120 cpumask_copy(desc->pending_mask, cpumask);
117 } 121 }
118#else 122#else
119 cpumask_copy(desc->affinity, cpumask); 123 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 124 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask);
126 }
121#endif 127#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 128 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 129 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 130 return 0;
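The irq_set_affinity() change above stops copying the mask unconditionally: desc->affinity and the irq-thread affinity are updated only when the chip's ->set_affinity() callback reports success. A condensed sketch of that commit-on-success shape, reusing the calls shown in the hunk; the wrapper name apply_affinity() is hypothetical.

#include <linux/irq.h>

#include "internals.h"	/* irq_set_thread_affinity(), as in manage.c */

/* Hypothetical condensation of the success-only update in irq_set_affinity(). */
static int apply_affinity(struct irq_desc *desc, unsigned int irq,
			  const struct cpumask *cpumask)
{
	int ret = desc->chip->set_affinity(irq, cpumask);

	if (!ret) {
		/* The hardware accepted the mask: record it and move the thread. */
		cpumask_copy(desc->affinity, cpumask);
		irq_set_thread_affinity(desc, cpumask);
	}
	return ret;
}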
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..cfe767ca1545 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc, desc->pending_mask);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..2f69bee57bf2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu;
115 int node, old_node;
116
117 /* those all static, do move them */ 110 /* those all static, do move them */
118 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/*
34 * These will be re-linked against their real values
35 * during the second link stage.
36 */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 37extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak)); 38extern const u8 kallsyms_names[] __attribute__((weak));
36 39
37/* tell the compiler that the count isn't in the small data section if the arch 40/*
38 * has one (eg: FRV) 41 * Tell the compiler that the count isn't in the small data section if the arch
42 * has one (eg: FRV).
39 */ 43 */
40extern const unsigned long kallsyms_num_syms 44extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 45__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
75 return is_kernel_text(addr) || is_kernel_inittext(addr); 79 return is_kernel_text(addr) || is_kernel_inittext(addr);
76} 80}
77 81
78/* expand a compressed symbol data into the resulting uncompressed string, 82/*
79 given the offset to where the symbol is in the compressed stream */ 83 * Expand a compressed symbol data into the resulting uncompressed string,
84 * given the offset to where the symbol is in the compressed stream.
85 */
80static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 86static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
81{ 87{
82 int len, skipped_first = 0; 88 int len, skipped_first = 0;
83 const u8 *tptr, *data; 89 const u8 *tptr, *data;
84 90
85 /* get the compressed symbol length from the first symbol byte */ 91 /* Get the compressed symbol length from the first symbol byte. */
86 data = &kallsyms_names[off]; 92 data = &kallsyms_names[off];
87 len = *data; 93 len = *data;
88 data++; 94 data++;
89 95
90 /* update the offset to return the offset for the next symbol on 96 /*
91 * the compressed stream */ 97 * Update the offset to return the offset for the next symbol on
98 * the compressed stream.
99 */
92 off += len + 1; 100 off += len + 1;
93 101
94 /* for every byte on the compressed symbol data, copy the table 102 /*
95 entry for that byte */ 103 * For every byte on the compressed symbol data, copy the table
96 while(len) { 104 * entry for that byte.
97 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; 105 */
106 while (len) {
107 tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
98 data++; 108 data++;
99 len--; 109 len--;
100 110
101 while (*tptr) { 111 while (*tptr) {
102 if(skipped_first) { 112 if (skipped_first) {
103 *result = *tptr; 113 *result = *tptr;
104 result++; 114 result++;
105 } else 115 } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
110 120
111 *result = '\0'; 121 *result = '\0';
112 122
113 /* return to offset to the next symbol */ 123 /* Return to offset to the next symbol. */
114 return off; 124 return off;
115} 125}
116 126
117/* get symbol type information. This is encoded as a single char at the 127/*
118 * begining of the symbol name */ 128 * Get symbol type information. This is encoded as a single char at the
129 * beginning of the symbol name.
130 */
119static char kallsyms_get_symbol_type(unsigned int off) 131static char kallsyms_get_symbol_type(unsigned int off)
120{ 132{
121 /* get just the first code, look it up in the token table, and return the 133 /*
122 * first char from this token */ 134 * Get just the first code, look it up in the token table,
123 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; 135 * and return the first char from this token.
136 */
137 return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
124} 138}
125 139
126 140
127/* find the offset on the compressed stream given an index in the 141/*
128 * kallsyms array */ 142 * Find the offset on the compressed stream given an index in the
143 * kallsyms array.
144 */
129static unsigned int get_symbol_offset(unsigned long pos) 145static unsigned int get_symbol_offset(unsigned long pos)
130{ 146{
131 const u8 *name; 147 const u8 *name;
132 int i; 148 int i;
133 149
134 /* use the closest marker we have. We have markers every 256 positions, 150 /*
135 * so that should be close enough */ 151 * Use the closest marker we have. We have markers every 256 positions,
136 name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; 152 * so that should be close enough.
153 */
154 name = &kallsyms_names[kallsyms_markers[pos >> 8]];
137 155
138 /* sequentially scan all the symbols up to the point we're searching for. 156 /*
139 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we 157 * Sequentially scan all the symbols up to the point we're searching
140 * just need to add the len to the current pointer for every symbol we 158 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
141 * wish to skip */ 159 * so we just need to add the len to the current pointer for every
142 for(i = 0; i < (pos&0xFF); i++) 160 * symbol we wish to skip.
161 */
162 for (i = 0; i < (pos & 0xFF); i++)
143 name = name + (*name) + 1; 163 name = name + (*name) + 1;
144 164
145 return name - kallsyms_names; 165 return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
190 /* This kernel should never have been booted. */ 210 /* This kernel should never have been booted. */
191 BUG_ON(!kallsyms_addresses); 211 BUG_ON(!kallsyms_addresses);
192 212
193 /* do a binary search on the sorted kallsyms_addresses array */ 213 /* Do a binary search on the sorted kallsyms_addresses array. */
194 low = 0; 214 low = 0;
195 high = kallsyms_num_syms; 215 high = kallsyms_num_syms;
196 216
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
203 } 223 }
204 224
205 /* 225 /*
206 * search for the first aliased symbol. Aliased 226 * Search for the first aliased symbol. Aliased
207 * symbols are symbols with the same address 227 * symbols are symbols with the same address.
208 */ 228 */
209 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 229 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
210 --low; 230 --low;
211 231
212 symbol_start = kallsyms_addresses[low]; 232 symbol_start = kallsyms_addresses[low];
213 233
214 /* Search for next non-aliased symbol */ 234 /* Search for next non-aliased symbol. */
215 for (i = low + 1; i < kallsyms_num_syms; i++) { 235 for (i = low + 1; i < kallsyms_num_syms; i++) {
216 if (kallsyms_addresses[i] > symbol_start) { 236 if (kallsyms_addresses[i] > symbol_start) {
217 symbol_end = kallsyms_addresses[i]; 237 symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
219 } 239 }
220 } 240 }
221 241
222 /* if we found no next symbol, we use the end of the section */ 242 /* If we found no next symbol, we use the end of the section. */
223 if (!symbol_end) { 243 if (!symbol_end) {
224 if (is_kernel_inittext(addr)) 244 if (is_kernel_inittext(addr))
225 symbol_end = (unsigned long)_einittext; 245 symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
252 272
253/* 273/*
254 * Lookup an address 274 * Lookup an address
255 * - modname is set to NULL if it's in the kernel 275 * - modname is set to NULL if it's in the kernel.
256 * - we guarantee that the returned name is valid until we reschedule even if 276 * - We guarantee that the returned name is valid until we reschedule even if
257 * it resides in a module 277 * it resides in a module.
258 * - we also guarantee that modname will be valid until rescheduled 278 * - We also guarantee that modname will be valid until rescheduled.
259 */ 279 */
260const char *kallsyms_lookup(unsigned long addr, 280const char *kallsyms_lookup(unsigned long addr,
261 unsigned long *symbolsize, 281 unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
276 return namebuf; 296 return namebuf;
277 } 297 }
278 298
279 /* see if it's in a module */ 299 /* See if it's in a module. */
280 return module_address_lookup(addr, symbolsize, offset, modname, 300 return module_address_lookup(addr, symbolsize, offset, modname,
281 namebuf); 301 namebuf);
282} 302}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
294 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 314 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
295 return 0; 315 return 0;
296 } 316 }
297 /* see if it's in a module */ 317 /* See if it's in a module. */
298 return lookup_module_symbol_name(addr, symname); 318 return lookup_module_symbol_name(addr, symname);
299} 319}
300 320
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
313 modname[0] = '\0'; 333 modname[0] = '\0';
314 return 0; 334 return 0;
315 } 335 }
316 /* see if it's in a module */ 336 /* See if it's in a module. */
317 return lookup_module_symbol_attrs(addr, size, offset, modname, name); 337 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
318} 338}
319 339
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
342 362
343 return len; 363 return len;
344} 364}
365EXPORT_SYMBOL_GPL(sprint_symbol);
345 366
346/* Look up a kernel symbol and print it to the kernel messages. */ 367/* Look up a kernel symbol and print it to the kernel messages. */
347void __print_symbol(const char *fmt, unsigned long address) 368void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
352 373
353 printk(fmt, buffer); 374 printk(fmt, buffer);
354} 375}
376EXPORT_SYMBOL(__print_symbol);
355 377
356/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 378/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
357struct kallsym_iter 379struct kallsym_iter {
358{
359 loff_t pos; 380 loff_t pos;
360 unsigned long value; 381 unsigned long value;
361 unsigned int nameoff; /* If iterating in core kernel symbols */ 382 unsigned int nameoff; /* If iterating in core kernel symbols. */
362 char type; 383 char type;
363 char name[KSYM_NAME_LEN]; 384 char name[KSYM_NAME_LEN];
364 char module_name[MODULE_NAME_LEN]; 385 char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
404 iter->pos = pos; 425 iter->pos = pos;
405 return get_ksymbol_mod(iter); 426 return get_ksymbol_mod(iter);
406 } 427 }
407 428
408 /* If we're not on the desired position, reset to new position. */ 429 /* If we're not on the desired position, reset to new position. */
409 if (pos != iter->pos) 430 if (pos != iter->pos)
410 reset_iter(iter, pos); 431 reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
439{ 460{
440 struct kallsym_iter *iter = m->private; 461 struct kallsym_iter *iter = m->private;
441 462
442 /* Some debugging symbols have no name. Ignore them. */ 463 /* Some debugging symbols have no name. Ignore them. */
443 if (!iter->name[0]) 464 if (!iter->name[0])
444 return 0; 465 return 0;
445 466
446 if (iter->module_name[0]) { 467 if (iter->module_name[0]) {
447 char type; 468 char type;
448 469
449 /* Label it "global" if it is exported, 470 /*
450 * "local" if not exported. */ 471 * Label it "global" if it is exported,
472 * "local" if not exported.
473 */
451 type = iter->exported ? toupper(iter->type) : 474 type = iter->exported ? toupper(iter->type) :
452 tolower(iter->type); 475 tolower(iter->type);
453 seq_printf(m, "%0*lx %c %s\t[%s]\n", 476 seq_printf(m, "%0*lx %c %s\t[%s]\n",
454 (int)(2*sizeof(void*)), 477 (int)(2 * sizeof(void *)),
455 iter->value, type, iter->name, iter->module_name); 478 iter->value, type, iter->name, iter->module_name);
456 } else 479 } else
457 seq_printf(m, "%0*lx %c %s\n", 480 seq_printf(m, "%0*lx %c %s\n",
458 (int)(2*sizeof(void*)), 481 (int)(2 * sizeof(void *)),
459 iter->value, iter->type, iter->name); 482 iter->value, iter->type, iter->name);
460 return 0; 483 return 0;
461} 484}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
469 492
470static int kallsyms_open(struct inode *inode, struct file *file) 493static int kallsyms_open(struct inode *inode, struct file *file)
471{ 494{
472 /* We keep iterator in m->private, since normal case is to 495 /*
496 * We keep iterator in m->private, since normal case is to
473 * s_start from where we left off, so we avoid doing 497 * s_start from where we left off, so we avoid doing
474 * using get_symbol_offset for every symbol */ 498 * using get_symbol_offset for every symbol.
499 */
475 struct kallsym_iter *iter; 500 struct kallsym_iter *iter;
476 int ret; 501 int ret;
477 502
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
500 proc_create("kallsyms", 0444, NULL, &kallsyms_operations); 525 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
501 return 0; 526 return 0;
502} 527}
503__initcall(kallsyms_init); 528device_initcall(kallsyms_init);
504
505EXPORT_SYMBOL(__print_symbol);
506EXPORT_SYMBOL_GPL(sprint_symbol);
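get_symbol_offset() above relies on the stream layout spelled out in the reworded comment: each symbol is one length byte followed by that many data bytes, so skipping a symbol is just name += *name + 1. A stand-alone user-space sketch of that walk over a made-up stream; all names and data below are invented for illustration.

#include <stdio.h>

/* Three fake compressed symbols: [len][len bytes of data] back to back. */
static const unsigned char stream[] = {
	3, 'f', 'o', 'o',
	2, 'h', 'i',
	4, 't', 'e', 's', 't',
};

/* Return the offset of symbol 'pos', mirroring the skip loop above. */
static unsigned int symbol_offset(unsigned long pos)
{
	const unsigned char *name = stream;
	unsigned long i;

	for (i = 0; i < pos; i++)
		name = name + (*name) + 1;

	return name - stream;
}

int main(void)
{
	printf("symbol 2 starts at offset %u\n", symbol_offset(2));	/* prints 7 */
	return 0;
}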
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5a758c6e4950..ae1c35201cc8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1448,18 +1448,17 @@ int kernel_kexec(void)
1448 goto Restore_console; 1448 goto Restore_console;
1449 } 1449 }
1450 suspend_console(); 1450 suspend_console();
1451 error = device_suspend(PMSG_FREEZE); 1451 error = dpm_suspend_start(PMSG_FREEZE);
1452 if (error) 1452 if (error)
1453 goto Resume_console; 1453 goto Resume_console;
1454 device_pm_lock(); 1454 /* At this point, dpm_suspend_start() has been called,
1455 /* At this point, device_suspend() has been called, 1455 * but *not* dpm_suspend_noirq(). We *must* call
1456 * but *not* device_power_down(). We *must* 1456 * dpm_suspend_noirq() now. Otherwise, drivers for
1457 * device_power_down() now. Otherwise, drivers for
1458 * some devices (e.g. interrupt controllers) become 1457 * some devices (e.g. interrupt controllers) become
1459 * desynchronized with the actual state of the 1458 * desynchronized with the actual state of the
1460 * hardware at resume time, and evil weirdness ensues. 1459 * hardware at resume time, and evil weirdness ensues.
1461 */ 1460 */
1462 error = device_power_down(PMSG_FREEZE); 1461 error = dpm_suspend_noirq(PMSG_FREEZE);
1463 if (error) 1462 if (error)
1464 goto Resume_devices; 1463 goto Resume_devices;
1465 error = disable_nonboot_cpus(); 1464 error = disable_nonboot_cpus();
@@ -1487,10 +1486,9 @@ int kernel_kexec(void)
1487 local_irq_enable(); 1486 local_irq_enable();
1488 Enable_cpus: 1487 Enable_cpus:
1489 enable_nonboot_cpus(); 1488 enable_nonboot_cpus();
1490 device_power_up(PMSG_RESTORE); 1489 dpm_resume_noirq(PMSG_RESTORE);
1491 Resume_devices: 1490 Resume_devices:
1492 device_pm_unlock(); 1491 dpm_resume_end(PMSG_RESTORE);
1493 device_resume(PMSG_RESTORE);
1494 Resume_console: 1492 Resume_console:
1495 resume_console(); 1493 resume_console();
1496 thaw_processes(); 1494 thaw_processes();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
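The kfifo hunk swaps the open-coded size & (size - 1) test for is_power_of_2(), but the requirement is unchanged: the "let the indices wrap" technique only works when the buffer size is a power of two, because then idx & (size - 1) is a valid modulo and in - out stays correct across unsigned overflow. A small user-space sketch of that property; the ring name and sample index values are made up.

#include <stdio.h>

#define RING_SIZE 8u			/* must be a power of two */

static unsigned int in, out;		/* free-running indices, allowed to wrap */

static unsigned int ring_len(void)
{
	return in - out;		/* correct even across unsigned wrap-around */
}

static unsigned int ring_slot(unsigned int idx)
{
	return idx & (RING_SIZE - 1);	/* cheap modulo, power-of-two sizes only */
}

int main(void)
{
	in  = 0xfffffffe;		/* pretend both indices already wrapped far */
	out = 0xfffffffc;
	printf("len=%u, next slot=%u\n", ring_len(), ring_slot(in));
	return 0;
}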
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b750675251e5..7e95bedb2bfc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
370 sub_info->argv = argv; 370 sub_info->argv = argv;
371 sub_info->envp = envp; 371 sub_info->envp = envp;
372 sub_info->cred = prepare_usermodehelper_creds(); 372 sub_info->cred = prepare_usermodehelper_creds();
373 if (!sub_info->cred) 373 if (!sub_info->cred) {
374 kfree(sub_info);
374 return NULL; 375 return NULL;
376 }
375 377
376 out: 378 out:
377 return sub_info; 379 return sub_info;
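The kmod.c fix frees the freshly allocated sub_info when prepare_usermodehelper_creds() fails, instead of leaking it on the early return. The same clean-up-on-failure shape in a tiny user-space sketch; the demo_* names are stand-ins, not kernel APIs.

#include <stdlib.h>

struct demo_info {
	void *cred;
};

/* Allocate an outer object, then undo that allocation on any later failure. */
static struct demo_info *demo_setup(void)
{
	struct demo_info *info = calloc(1, sizeof(*info));

	if (!info)
		return NULL;

	info->cred = malloc(16);	/* stand-in for the credential allocation */
	if (!info->cred) {
		free(info);		/* the fix: don't leak the outer object */
		return NULL;
	}
	return info;
}

int main(void)
{
	struct demo_info *info = demo_setup();

	if (info) {
		free(info->cred);
		free(info);
	}
	return 0;
}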
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
236 ignore_signals(tsk); 237 ignore_signals(tsk);
237 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 238 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 set_cpus_allowed_ptr(tsk, cpu_all_mask); 239 set_cpus_allowed_ptr(tsk, cpu_all_mask);
240 set_mems_allowed(node_possible_map);
239 241
240 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 243
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2cc7e9a6e84..699a2ac3a0d7 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 8192UL 57#define MAX_LOCKDEP_ENTRIES 16384UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 14 59#define MAX_LOCKDEP_CHAINS_BITS 15
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
diff --git a/kernel/module.c b/kernel/module.c
index 2383e60fcf3f..215aaab09e91 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,6 +53,7 @@
53#include <linux/ftrace.h> 53#include <linux/ftrace.h>
54#include <linux/async.h> 54#include <linux/async.h>
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
56 57
57#if 0 58#if 0
58#define DEBUGP printk 59#define DEBUGP printk
@@ -73,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
73EXPORT_SYMBOL_GPL(module_mutex); 74EXPORT_SYMBOL_GPL(module_mutex);
74static LIST_HEAD(modules); 75static LIST_HEAD(modules);
75 76
77/* Block module loading/unloading? */
78int modules_disabled = 0;
79
76/* Waiting for a module to finish initializing? */ 80/* Waiting for a module to finish initializing? */
77static DECLARE_WAIT_QUEUE_HEAD(module_wq); 81static DECLARE_WAIT_QUEUE_HEAD(module_wq);
78 82
@@ -430,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
430 unsigned long extra; 434 unsigned long extra;
431 unsigned int i; 435 unsigned int i;
432 void *ptr; 436 void *ptr;
437 int cpu;
433 438
434 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
435 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -459,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
459 if (!split_block(i, size)) 464 if (!split_block(i, size))
460 return NULL; 465 return NULL;
461 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
462 /* Mark allocated */ 472 /* Mark allocated */
463 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
464 return ptr; 474 return ptr;
@@ -473,6 +483,7 @@ static void percpu_modfree(void *freeme)
473{ 483{
474 unsigned int i; 484 unsigned int i;
475 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
476 487
477 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
478 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -484,6 +495,10 @@ static void percpu_modfree(void *freeme)
484 BUG(); 495 BUG();
485 496
486 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
487 /* Merge with previous? */ 502 /* Merge with previous? */
488 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
489 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -778,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
778 char name[MODULE_NAME_LEN]; 793 char name[MODULE_NAME_LEN];
779 int ret, forced = 0; 794 int ret, forced = 0;
780 795
781 if (!capable(CAP_SYS_MODULE)) 796 if (!capable(CAP_SYS_MODULE) || modules_disabled)
782 return -EPERM; 797 return -EPERM;
783 798
784 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 799 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -1876,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1876 return ret; 1891 return ret;
1877} 1892}
1878 1893
1894#ifdef CONFIG_DEBUG_KMEMLEAK
1895static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1896 Elf_Shdr *sechdrs, char *secstrings)
1897{
1898 unsigned int i;
1899
1900 /* only scan the sections containing data */
1901 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1902 (unsigned long)mod->module_core,
1903 sizeof(struct module), GFP_KERNEL);
1904
1905 for (i = 1; i < hdr->e_shnum; i++) {
1906 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1907 continue;
1908 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1909 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1910 continue;
1911
1912 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1913 (unsigned long)mod->module_core,
1914 sechdrs[i].sh_size, GFP_KERNEL);
1915 }
1916}
1917#else
1918static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1919 Elf_Shdr *sechdrs, char *secstrings)
1920{
1921}
1922#endif
1923
1879/* Allocate and load the module: note that size of section 0 is always 1924/* Allocate and load the module: note that size of section 0 is always
1880 zero, and we rely on this for optional sections. */ 1925 zero, and we rely on this for optional sections. */
1881static noinline struct module *load_module(void __user *umod, 1926static noinline struct module *load_module(void __user *umod,
@@ -2046,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod,
2046 2091
2047 /* Do the allocs. */ 2092 /* Do the allocs. */
2048 ptr = module_alloc_update_bounds(mod->core_size); 2093 ptr = module_alloc_update_bounds(mod->core_size);
2094 /*
2095 * The pointer to this block is stored in the module structure
2096 * which is inside the block. Just mark it as not being a
2097 * leak.
2098 */
2099 kmemleak_not_leak(ptr);
2049 if (!ptr) { 2100 if (!ptr) {
2050 err = -ENOMEM; 2101 err = -ENOMEM;
2051 goto free_percpu; 2102 goto free_percpu;
@@ -2054,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod,
2054 mod->module_core = ptr; 2105 mod->module_core = ptr;
2055 2106
2056 ptr = module_alloc_update_bounds(mod->init_size); 2107 ptr = module_alloc_update_bounds(mod->init_size);
2108 /*
2109 * The pointer to this block is stored in the module structure
2110 * which is inside the block. This block doesn't need to be
2111 * scanned as it contains data and code that will be freed
2112 * after the module is initialized.
2113 */
2114 kmemleak_ignore(ptr);
2057 if (!ptr && mod->init_size) { 2115 if (!ptr && mod->init_size) {
2058 err = -ENOMEM; 2116 err = -ENOMEM;
2059 goto free_core; 2117 goto free_core;
@@ -2084,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod,
2084 } 2142 }
2085 /* Module has been moved. */ 2143 /* Module has been moved. */
2086 mod = (void *)sechdrs[modindex].sh_addr; 2144 mod = (void *)sechdrs[modindex].sh_addr;
2145 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2087 2146
2088#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2147#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2089 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2148 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2338,7 +2397,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2338 int ret = 0; 2397 int ret = 0;
2339 2398
2340 /* Must have permission */ 2399 /* Must have permission */
2341 if (!capable(CAP_SYS_MODULE)) 2400 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2342 return -EPERM; 2401 return -EPERM;
2343 2402
2344 /* Only one module load at a time, please */ 2403 /* Only one module load at a time, please */
@@ -2396,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2396 mutex_lock(&module_mutex); 2455 mutex_lock(&module_mutex);
2397 /* Drop initial reference. */ 2456 /* Drop initial reference. */
2398 module_put(mod); 2457 module_put(mod);
2458 trim_init_extable(mod);
2399 module_free(mod, mod->module_init); 2459 module_free(mod, mod->module_init);
2400 mod->module_init = NULL; 2460 mod->module_init = NULL;
2401 mod->init_size = 0; 2461 mod->init_size = 0;
@@ -2839,7 +2899,7 @@ void print_modules(void)
2839 struct module *mod; 2899 struct module *mod;
2840 char buf[8]; 2900 char buf[8];
2841 2901
2842 printk("Modules linked in:"); 2902 printk(KERN_DEFAULT "Modules linked in:");
2843 /* Most callers should already have preempt disabled, but make sure */ 2903 /* Most callers should already have preempt disabled, but make sure */
2844 preempt_disable(); 2904 preempt_disable();
2845 list_for_each_entry_rcu(mod, &modules, list) 2905 list_for_each_entry_rcu(mod, &modules, list)
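kmemleak_load_module() above limits scanning to the sections that can actually hold pointers, selecting them by name prefix (".data", ".bss") in the section string table. A user-space sketch of just that prefix filter; the sample section names are arbitrary.

#include <stdio.h>
#include <string.h>

static int is_pointer_section(const char *name)
{
	return strncmp(name, ".data", 5) == 0 || strncmp(name, ".bss", 4) == 0;
}

int main(void)
{
	const char *names[] = { ".text", ".data", ".bss", ".data.percpu", ".rodata" };
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-14s %s\n", names[i],
		       is_pointer_section(names[i]) ? "scan" : "skip");
	return 0;
}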
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didn't get the lock, go to sleep: */ 250 /* didn't get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic which we are to dec
481 * @lock: the mutex to return holding if we dec to 0
482 *
483 * return true and hold lock if we dec to 0, return false otherwise
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
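atomic_dec_and_mutex_lock(), added above, returns nonzero only when the decrement reached zero, and in that case the caller comes back still holding the mutex, which suits release paths that must tear an object down exactly once. A hypothetical usage sketch; the demo_* names and the registry layout are assumptions, not existing kernel code.

#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/atomic.h>

struct demo_obj {
	atomic_t refcount;
	/* ... payload ... */
};

static DEFINE_MUTEX(demo_lock);		/* protects the registry below */
static struct demo_obj *demo_registered;

static void demo_put(struct demo_obj *obj)
{
	/* Fast path: the count stayed above zero, no lock was taken. */
	if (!atomic_dec_and_mutex_lock(&obj->refcount, &demo_lock))
		return;

	/* Slow path: we hit zero and hold demo_lock, so unpublish and free. */
	demo_registered = NULL;
	mutex_unlock(&demo_lock);
	kfree(obj);
}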
diff --git a/kernel/panic.c b/kernel/panic.c
index 874ecf1307ae..984b3ecbd72c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -340,39 +340,44 @@ void oops_exit(void)
340} 340}
341 341
342#ifdef WANT_WARN_ON_SLOWPATH 342#ifdef WANT_WARN_ON_SLOWPATH
343void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 343struct slowpath_args {
344{ 344 const char *fmt;
345 va_list args; 345 va_list args;
346 char function[KSYM_SYMBOL_LEN]; 346};
347 unsigned long caller = (unsigned long)__builtin_return_address(0);
348 const char *board;
349 347
350 sprint_symbol(function, caller); 348static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
349{
350 const char *board;
351 351
352 printk(KERN_WARNING "------------[ cut here ]------------\n"); 352 printk(KERN_WARNING "------------[ cut here ]------------\n");
353 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 353 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
354 line, function);
355 board = dmi_get_system_info(DMI_PRODUCT_NAME); 354 board = dmi_get_system_info(DMI_PRODUCT_NAME);
356 if (board) 355 if (board)
357 printk(KERN_WARNING "Hardware name: %s\n", board); 356 printk(KERN_WARNING "Hardware name: %s\n", board);
358 357
359 if (*fmt) { 358 if (args)
360 va_start(args, fmt); 359 vprintk(args->fmt, args->args);
361 vprintk(fmt, args);
362 va_end(args);
363 }
364 360
365 print_modules(); 361 print_modules();
366 dump_stack(); 362 dump_stack();
367 print_oops_end_marker(); 363 print_oops_end_marker();
368 add_taint(TAINT_WARN); 364 add_taint(TAINT_WARN);
369} 365}
366
367void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
368{
369 struct slowpath_args args;
370
371 args.fmt = fmt;
372 va_start(args.args, fmt);
373 warn_slowpath_common(file, line, __builtin_return_address(0), &args);
374 va_end(args.args);
375}
370EXPORT_SYMBOL(warn_slowpath_fmt); 376EXPORT_SYMBOL(warn_slowpath_fmt);
371 377
372void warn_slowpath_null(const char *file, int line) 378void warn_slowpath_null(const char *file, int line)
373{ 379{
374 static const char *empty = ""; 380 warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
375 warn_slowpath_fmt(file, line, empty);
376} 381}
377EXPORT_SYMBOL(warn_slowpath_null); 382EXPORT_SYMBOL(warn_slowpath_null);
378#endif 383#endif
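The panic.c rework routes warn_slowpath_fmt() and warn_slowpath_null() through one warn_slowpath_common() by carrying the va_list inside a small struct instead of re-expanding the format in each caller. The same struct-wrapped va_list pattern as a stand-alone user-space sketch; the function and struct names below are made up.

#include <stdarg.h>
#include <stdio.h>

struct slow_args {
	const char *fmt;
	va_list args;
};

/* Shared tail: one place prints the header and the optional message. */
static void warn_common(const char *file, int line, struct slow_args *args)
{
	printf("WARNING: at %s:%d\n", file, line);
	if (args)
		vprintf(args->fmt, args->args);
}

static void warn_fmt(const char *file, int line, const char *fmt, ...)
{
	struct slow_args args;

	args.fmt = fmt;
	va_start(args.args, fmt);
	warn_common(file, line, &args);
	va_end(args.args);
}

static void warn_null(const char *file, int line)
{
	warn_common(file, line, NULL);	/* no message, header only */
}

int main(void)
{
	warn_fmt(__FILE__, __LINE__, "value was %d\n", 42);
	warn_null(__FILE__, __LINE__);
	return 0;
}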
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd2..7f6912ced2ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
30#if 0 27#if 0
31#define DEBUGP printk 28#define DEBUGP printk
32#else 29#else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
220 return -ENOSPC; 217 return -ENOSPC;
221 } 218 }
222 219
223 if (kp->perm & KPARAM_KMALLOCED) 220 if (kp->flags & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg); 221 kfree(*(char **)kp->arg);
225 222
226 /* This is a hack. We can't need to strdup in early boot, and we 223 /* This is a hack. We can't need to strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */ 224 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) { 225 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED; 226 kp->flags |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg) 228 if (!kp->arg)
232 return -ENOMEM; 229 return -ENOMEM;
@@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
241 return sprintf(buffer, "%s", *((char **)kp->arg)); 238 return sprintf(buffer, "%s", *((char **)kp->arg));
242} 239}
243 240
241/* Actually could be a bool or an int, for historical reasons. */
244int param_set_bool(const char *val, struct kernel_param *kp) 242int param_set_bool(const char *val, struct kernel_param *kp)
245{ 243{
244 bool v;
245
246 /* No equals means "set"... */ 246 /* No equals means "set"... */
247 if (!val) val = "1"; 247 if (!val) val = "1";
248 248
249 /* One of =[yYnN01] */ 249 /* One of =[yYnN01] */
250 switch (val[0]) { 250 switch (val[0]) {
251 case 'y': case 'Y': case '1': 251 case 'y': case 'Y': case '1':
252 *(int *)kp->arg = 1; 252 v = true;
253 return 0; 253 break;
254 case 'n': case 'N': case '0': 254 case 'n': case 'N': case '0':
255 *(int *)kp->arg = 0; 255 v = false;
256 return 0; 256 break;
257 default:
258 return -EINVAL;
257 } 259 }
258 return -EINVAL; 260
261 if (kp->flags & KPARAM_ISBOOL)
262 *(bool *)kp->arg = v;
263 else
264 *(int *)kp->arg = v;
265 return 0;
259} 266}
260 267
261int param_get_bool(char *buffer, struct kernel_param *kp) 268int param_get_bool(char *buffer, struct kernel_param *kp)
262{ 269{
270 bool val;
271 if (kp->flags & KPARAM_ISBOOL)
272 val = *(bool *)kp->arg;
273 else
274 val = *(int *)kp->arg;
275
263 /* Y and N chosen as being relatively non-coder friendly */ 276 /* Y and N chosen as being relatively non-coder friendly */
264 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); 277 return sprintf(buffer, "%c", val ? 'Y' : 'N');
265} 278}
266 279
280/* This one must be bool. */
267int param_set_invbool(const char *val, struct kernel_param *kp) 281int param_set_invbool(const char *val, struct kernel_param *kp)
268{ 282{
269 int boolval, ret; 283 int ret;
284 bool boolval;
270 struct kernel_param dummy; 285 struct kernel_param dummy;
271 286
272 dummy.arg = &boolval; 287 dummy.arg = &boolval;
288 dummy.flags = KPARAM_ISBOOL;
273 ret = param_set_bool(val, &dummy); 289 ret = param_set_bool(val, &dummy);
274 if (ret == 0) 290 if (ret == 0)
275 *(int *)kp->arg = !boolval; 291 *(bool *)kp->arg = !boolval;
276 return ret; 292 return ret;
277} 293}
278 294
279int param_get_invbool(char *buffer, struct kernel_param *kp) 295int param_get_invbool(char *buffer, struct kernel_param *kp)
280{ 296{
281 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); 297 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
282} 298}
283 299
284/* We break the rule and mangle the string. */ 300/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
591 unsigned int i; 607 unsigned int i;
592 608
593 for (i = 0; i < num; i++) 609 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED) 610 if (params[i].flags & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg); 611 kfree(*(char **)params[i].arg);
596} 612}
597 613
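param_set_bool() now parses the value into a local bool and only then stores it through either a bool or an int pointer, depending on KPARAM_ISBOOL. The parsing half (a missing value means "set", otherwise one of y/Y/1/n/N/0) is easy to exercise on its own; a user-space sketch with hypothetical names, returning -1 where the kernel returns -EINVAL.

#include <stdbool.h>
#include <stdio.h>

/* Returns 0 on success, -1 on a value it does not recognise. */
static int parse_bool(const char *val, bool *out)
{
	if (!val)			/* bare flag with no '=': treat as "set" */
		val = "1";

	switch (val[0]) {
	case 'y': case 'Y': case '1':
		*out = true;
		return 0;
	case 'n': case 'N': case '0':
		*out = false;
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	const char *inputs[] = { "Y", "0", NULL, "maybe" };
	unsigned int i;
	bool v;

	for (i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
		const char *label = inputs[i] ? inputs[i] : "(none)";

		if (parse_bool(inputs[i], &v))
			printf("%-6s -> invalid\n", label);
		else
			printf("%-6s -> %s\n", label, v ? "true" : "false");
	}
	return 0;
}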
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..29b685f551aa
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4339 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45
46/*
47 * perf counter paranoia level:
48 * 0 - not paranoid
49 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv
51 */
52int sysctl_perf_counter_paranoid __read_mostly;
53
54static inline bool perf_paranoid_cpu(void)
55{
56 return sysctl_perf_counter_paranoid > 0;
57}
58
59static inline bool perf_paranoid_kernel(void)
60{
61 return sysctl_perf_counter_paranoid > 1;
62}
63
64int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66/*
67 * max perf counter sample rate
68 */
69int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71static atomic64_t perf_counter_id;
72
73/*
74 * Lock for (sysadmin-configurable) counter reservations:
75 */
76static DEFINE_SPINLOCK(perf_resource_lock);
77
78/*
79 * Architecture provided APIs - weak aliases:
80 */
81extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82{
83 return NULL;
84}
85
86void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); }
88
89void __weak hw_perf_counter_setup(int cpu) { barrier(); }
90
91int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader,
93 struct perf_cpu_context *cpuctx,
94 struct perf_counter_context *ctx, int cpu)
95{
96 return 0;
97}
98
99void __weak perf_counter_print_debug(void) { }
100
101static DEFINE_PER_CPU(int, disable_count);
102
103void __perf_disable(void)
104{
105 __get_cpu_var(disable_count)++;
106}
107
108bool __perf_enable(void)
109{
110 return !--__get_cpu_var(disable_count);
111}
112
113void perf_disable(void)
114{
115 __perf_disable();
116 hw_perf_disable();
117}
118
119void perf_enable(void)
120{
121 if (__perf_enable())
122 hw_perf_enable();
123}
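/*
 * Illustrative sketch, not part of the original file: perf_disable() and
 * perf_enable() nest through the per-CPU disable_count above, so only the
 * outermost perf_enable() re-arms the hardware. A hypothetical nested
 * caller that wants the PMU quiet around a context update:
 */
static void example_quiesce_pmu(void)
{
	perf_disable();		/* disable_count 0 -> 1, hw_perf_disable() */
	perf_disable();		/* 1 -> 2, hw_perf_disable() again (harmless) */

	/* ... update counter lists while the PMU is quiet ... */

	perf_enable();		/* 2 -> 1, __perf_enable() is false, PMU stays off */
	perf_enable();		/* 1 -> 0, hw_perf_enable() */
}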
124
125static void get_ctx(struct perf_counter_context *ctx)
126{
127 atomic_inc(&ctx->refcount);
128}
129
130static void free_ctx(struct rcu_head *head)
131{
132 struct perf_counter_context *ctx;
133
134 ctx = container_of(head, struct perf_counter_context, rcu_head);
135 kfree(ctx);
136}
137
138static void put_ctx(struct perf_counter_context *ctx)
139{
140 if (atomic_dec_and_test(&ctx->refcount)) {
141 if (ctx->parent_ctx)
142 put_ctx(ctx->parent_ctx);
143 if (ctx->task)
144 put_task_struct(ctx->task);
145 call_rcu(&ctx->rcu_head, free_ctx);
146 }
147}
148
149/*
150 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with the fact that until it is locked,
152 * the context could get moved to another task.
153 */
154static struct perf_counter_context *
155perf_lock_task_context(struct task_struct *task, unsigned long *flags)
156{
157 struct perf_counter_context *ctx;
158
159 rcu_read_lock();
160 retry:
161 ctx = rcu_dereference(task->perf_counter_ctxp);
162 if (ctx) {
163 /*
164 * If this context is a clone of another, it might
165 * get swapped for another underneath us by
166 * perf_counter_task_sched_out, though the
167 * rcu_read_lock() protects us from any context
168 * getting freed. Lock the context and check if it
169 * got swapped before we could get the lock, and retry
170 * if so. If we locked the right context, then it
171 * can't get swapped on us any more.
172 */
173 spin_lock_irqsave(&ctx->lock, *flags);
174 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry;
177 }
178 }
179 rcu_read_unlock();
180 return ctx;
181}
182
183/*
184 * Get the context for a task and increment its pin_count so it
185 * can't get swapped to another task. This also increments its
186 * reference count so that the context can't get freed.
187 */
188static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
189{
190 struct perf_counter_context *ctx;
191 unsigned long flags;
192
193 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) {
195 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags);
198 }
199 return ctx;
200}
201
202static void perf_unpin_context(struct perf_counter_context *ctx)
203{
204 unsigned long flags;
205
206 spin_lock_irqsave(&ctx->lock, flags);
207 --ctx->pin_count;
208 spin_unlock_irqrestore(&ctx->lock, flags);
209 put_ctx(ctx);
210}
211
212/*
213 * Add a counter to the lists for its context.
214 * Must be called with ctx->mutex and ctx->lock held.
215 */
216static void
217list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
218{
219 struct perf_counter *group_leader = counter->group_leader;
220
221 /*
222 * Depending on whether it is a standalone or sibling counter,
223 * add it straight to the context's counter list, or to the group
224 * leader's sibling list:
225 */
226 if (group_leader == counter)
227 list_add_tail(&counter->list_entry, &ctx->counter_list);
228 else {
229 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
230 group_leader->nr_siblings++;
231 }
232
233 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++;
235}
236
237/*
238 * Remove a counter from the lists for its context.
239 * Must be called with ctx->mutex and ctx->lock held.
240 */
241static void
242list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
243{
244 struct perf_counter *sibling, *tmp;
245
246 if (list_empty(&counter->list_entry))
247 return;
248 ctx->nr_counters--;
249
250 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry);
252
253 if (counter->group_leader != counter)
254 counter->group_leader->nr_siblings--;
255
256 /*
257 * If this was a group counter with sibling counters then
258 * upgrade the siblings to singleton counters by adding them
259 * to the context list directly:
260 */
261 list_for_each_entry_safe(sibling, tmp,
262 &counter->sibling_list, list_entry) {
263
264 list_move_tail(&sibling->list_entry, &ctx->counter_list);
265 sibling->group_leader = sibling;
266 }
267}
268
269static void
270counter_sched_out(struct perf_counter *counter,
271 struct perf_cpu_context *cpuctx,
272 struct perf_counter_context *ctx)
273{
274 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
275 return;
276
277 counter->state = PERF_COUNTER_STATE_INACTIVE;
278 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter);
280 counter->oncpu = -1;
281
282 if (!is_software_counter(counter))
283 cpuctx->active_oncpu--;
284 ctx->nr_active--;
285 if (counter->attr.exclusive || !cpuctx->active_oncpu)
286 cpuctx->exclusive = 0;
287}
288
289static void
290group_sched_out(struct perf_counter *group_counter,
291 struct perf_cpu_context *cpuctx,
292 struct perf_counter_context *ctx)
293{
294 struct perf_counter *counter;
295
296 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
297 return;
298
299 counter_sched_out(group_counter, cpuctx, ctx);
300
301 /*
302 * Schedule out siblings (if any):
303 */
304 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
305 counter_sched_out(counter, cpuctx, ctx);
306
307 if (group_counter->attr.exclusive)
308 cpuctx->exclusive = 0;
309}
310
311/*
312 * Cross CPU call to remove a performance counter
313 *
314 * We disable the counter on the hardware level first. After that we
315 * remove it from the context list.
316 */
317static void __perf_counter_remove_from_context(void *info)
318{
319 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
320 struct perf_counter *counter = info;
321 struct perf_counter_context *ctx = counter->ctx;
322
323 /*
324 * If this is a task context, we need to check whether it is
325 * the current task context of this cpu. If not it has been
326 * scheduled out before the smp call arrived.
327 */
328 if (ctx->task && cpuctx->task_ctx != ctx)
329 return;
330
331 spin_lock(&ctx->lock);
332 /*
333 * Protect the list operation against NMI by disabling the
334 * counters on a global level.
335 */
336 perf_disable();
337
338 counter_sched_out(counter, cpuctx, ctx);
339
340 list_del_counter(counter, ctx);
341
342 if (!ctx->task) {
343 /*
344 * Allow more per task counters with respect to the
345 * reservation:
346 */
347 cpuctx->max_pertask =
348 min(perf_max_counters - ctx->nr_counters,
349 perf_max_counters - perf_reserved_percpu);
350 }
351
352 perf_enable();
353 spin_unlock(&ctx->lock);
354}
355
356
357/*
358 * Remove the counter from a task's (or a CPU's) list of counters.
359 *
360 * Must be called with ctx->mutex held.
361 *
362 * CPU counters are removed with an smp call. For task counters we only
363 * make the call when the task is on a CPU.
364 *
365 * If counter->ctx is a cloned context, callers must make sure that
366 * every task struct that counter->ctx->task could possibly point to
367 * remains valid. This is OK when called from perf_release since
368 * that only calls us on the top-level context, which can't be a clone.
369 * When called from perf_counter_exit_task, it's OK because the
370 * context has been detached from its task.
371 */
372static void perf_counter_remove_from_context(struct perf_counter *counter)
373{
374 struct perf_counter_context *ctx = counter->ctx;
375 struct task_struct *task = ctx->task;
376
377 if (!task) {
378 /*
379 * Per cpu counters are removed via an smp call and
380 * the removal is always successful.
381 */
382 smp_call_function_single(counter->cpu,
383 __perf_counter_remove_from_context,
384 counter, 1);
385 return;
386 }
387
388retry:
389 task_oncpu_function_call(task, __perf_counter_remove_from_context,
390 counter);
391
392 spin_lock_irq(&ctx->lock);
393 /*
394 * If the context is active we need to retry the smp call.
395 */
396 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
397 spin_unlock_irq(&ctx->lock);
398 goto retry;
399 }
400
401 /*
402 * The lock prevents this context from being scheduled in, so we
403 * can remove the counter safely if the call above did not
404 * succeed.
405 */
406 if (!list_empty(&counter->list_entry)) {
407 list_del_counter(counter, ctx);
408 }
409 spin_unlock_irq(&ctx->lock);
410}
411
412static inline u64 perf_clock(void)
413{
414 return cpu_clock(smp_processor_id());
415}
416
417/*
418 * Update the record of the current time in a context.
419 */
420static void update_context_time(struct perf_counter_context *ctx)
421{
422 u64 now = perf_clock();
423
424 ctx->time += now - ctx->timestamp;
425 ctx->timestamp = now;
426}
427
428/*
429 * Update the total_time_enabled and total_time_running fields for a counter.
430 */
431static void update_counter_times(struct perf_counter *counter)
432{
433 struct perf_counter_context *ctx = counter->ctx;
434 u64 run_end;
435
436 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
437 return;
438
439 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
440
441 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
442 run_end = counter->tstamp_stopped;
443 else
444 run_end = ctx->time;
445
446 counter->total_time_running = run_end - counter->tstamp_running;
447}
448
449/*
450 * Update total_time_enabled and total_time_running for all counters in a group.
451 */
452static void update_group_times(struct perf_counter *leader)
453{
454 struct perf_counter *counter;
455
456 update_counter_times(leader);
457 list_for_each_entry(counter, &leader->sibling_list, list_entry)
458 update_counter_times(counter);
459}
460
461/*
462 * Cross CPU call to disable a performance counter
463 */
464static void __perf_counter_disable(void *info)
465{
466 struct perf_counter *counter = info;
467 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
468 struct perf_counter_context *ctx = counter->ctx;
469
470 /*
471 * If this is a per-task counter, need to check whether this
472 * counter's task is the current task on this cpu.
473 */
474 if (ctx->task && cpuctx->task_ctx != ctx)
475 return;
476
477 spin_lock(&ctx->lock);
478
479 /*
480 * If the counter is on, turn it off.
481 * If it is in error state, leave it in error state.
482 */
483 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
484 update_context_time(ctx);
485 update_counter_times(counter);
486 if (counter == counter->group_leader)
487 group_sched_out(counter, cpuctx, ctx);
488 else
489 counter_sched_out(counter, cpuctx, ctx);
490 counter->state = PERF_COUNTER_STATE_OFF;
491 }
492
493 spin_unlock(&ctx->lock);
494}
495
496/*
497 * Disable a counter.
498 *
499 * If counter->ctx is a cloned context, callers must make sure that
500 * every task struct that counter->ctx->task could possibly point to
501 * remains valid. This condition is satisfied when called through
502 * perf_counter_for_each_child or perf_counter_for_each because they
503 * hold the top-level counter's child_mutex, so any descendant that
504 * goes to exit will block in sync_child_counter.
505 * When called from perf_pending_counter it's OK because counter->ctx
506 * is the current context on this CPU and preemption is disabled,
507 * hence we can't get into perf_counter_task_sched_out for this context.
508 */
509static void perf_counter_disable(struct perf_counter *counter)
510{
511 struct perf_counter_context *ctx = counter->ctx;
512 struct task_struct *task = ctx->task;
513
514 if (!task) {
515 /*
516 * Disable the counter on the cpu that it's on
517 */
518 smp_call_function_single(counter->cpu, __perf_counter_disable,
519 counter, 1);
520 return;
521 }
522
523 retry:
524 task_oncpu_function_call(task, __perf_counter_disable, counter);
525
526 spin_lock_irq(&ctx->lock);
527 /*
528 * If the counter is still active, we need to retry the cross-call.
529 */
530 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
531 spin_unlock_irq(&ctx->lock);
532 goto retry;
533 }
534
535 /*
536 * Since we have the lock this context can't be scheduled
537 * in, so we can change the state safely.
538 */
539 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
540 update_counter_times(counter);
541 counter->state = PERF_COUNTER_STATE_OFF;
542 }
543
544 spin_unlock_irq(&ctx->lock);
545}
546
547static int
548counter_sched_in(struct perf_counter *counter,
549 struct perf_cpu_context *cpuctx,
550 struct perf_counter_context *ctx,
551 int cpu)
552{
553 if (counter->state <= PERF_COUNTER_STATE_OFF)
554 return 0;
555
556 counter->state = PERF_COUNTER_STATE_ACTIVE;
557 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
558 /*
559 * The new state must be visible before we turn it on in the hardware:
560 */
561 smp_wmb();
562
563 if (counter->pmu->enable(counter)) {
564 counter->state = PERF_COUNTER_STATE_INACTIVE;
565 counter->oncpu = -1;
566 return -EAGAIN;
567 }
568
569 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
570
571 if (!is_software_counter(counter))
572 cpuctx->active_oncpu++;
573 ctx->nr_active++;
574
575 if (counter->attr.exclusive)
576 cpuctx->exclusive = 1;
577
578 return 0;
579}
580
581static int
582group_sched_in(struct perf_counter *group_counter,
583 struct perf_cpu_context *cpuctx,
584 struct perf_counter_context *ctx,
585 int cpu)
586{
587 struct perf_counter *counter, *partial_group;
588 int ret;
589
590 if (group_counter->state == PERF_COUNTER_STATE_OFF)
591 return 0;
592
593 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
594 if (ret)
595 return ret < 0 ? ret : 0;
596
597 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
598 return -EAGAIN;
599
600 /*
601 * Schedule in siblings as one group (if any):
602 */
603 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
604 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
605 partial_group = counter;
606 goto group_error;
607 }
608 }
609
610 return 0;
611
612group_error:
613 /*
614 * Groups can be scheduled in as one unit only, so undo any
615 * partial group before returning:
616 */
617 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
618 if (counter == partial_group)
619 break;
620 counter_sched_out(counter, cpuctx, ctx);
621 }
622 counter_sched_out(group_counter, cpuctx, ctx);
623
624 return -EAGAIN;
625}
626
627/*
628 * Return 1 for a group consisting entirely of software counters,
629 * 0 if the group contains any hardware counters.
630 */
631static int is_software_only_group(struct perf_counter *leader)
632{
633 struct perf_counter *counter;
634
635 if (!is_software_counter(leader))
636 return 0;
637
638 list_for_each_entry(counter, &leader->sibling_list, list_entry)
639 if (!is_software_counter(counter))
640 return 0;
641
642 return 1;
643}
644
645/*
646 * Work out whether we can put this counter group on the CPU now.
647 */
648static int group_can_go_on(struct perf_counter *counter,
649 struct perf_cpu_context *cpuctx,
650 int can_add_hw)
651{
652 /*
653 * Groups consisting entirely of software counters can always go on.
654 */
655 if (is_software_only_group(counter))
656 return 1;
657 /*
658 * If an exclusive group is already on, no other hardware
659 * counters can go on.
660 */
661 if (cpuctx->exclusive)
662 return 0;
663 /*
664 * If this group is exclusive and there are already
665 * counters on the CPU, it can't go on.
666 */
667 if (counter->attr.exclusive && cpuctx->active_oncpu)
668 return 0;
669 /*
670 * Otherwise, try to add it if all previous groups were able
671 * to go on.
672 */
673 return can_add_hw;
674}
675
676static void add_counter_to_ctx(struct perf_counter *counter,
677 struct perf_counter_context *ctx)
678{
679 list_add_counter(counter, ctx);
680 counter->tstamp_enabled = ctx->time;
681 counter->tstamp_running = ctx->time;
682 counter->tstamp_stopped = ctx->time;
683}
684
685/*
686 * Cross CPU call to install and enable a performance counter
687 *
688 * Must be called with ctx->mutex held
689 */
690static void __perf_install_in_context(void *info)
691{
692 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
693 struct perf_counter *counter = info;
694 struct perf_counter_context *ctx = counter->ctx;
695 struct perf_counter *leader = counter->group_leader;
696 int cpu = smp_processor_id();
697 int err;
698
699 /*
700 * If this is a task context, we need to check whether it is
701 * the current task context of this cpu. If not it has been
702 * scheduled out before the smp call arrived.
703 * Or possibly this is the right context but it isn't
704 * on this cpu because it had no counters.
705 */
706 if (ctx->task && cpuctx->task_ctx != ctx) {
707 if (cpuctx->task_ctx || ctx->task != current)
708 return;
709 cpuctx->task_ctx = ctx;
710 }
711
712 spin_lock(&ctx->lock);
713 ctx->is_active = 1;
714 update_context_time(ctx);
715
716 /*
717 * Protect the list operation against NMI by disabling the
718 * counters on a global level. NOP for non NMI based counters.
719 */
720 perf_disable();
721
722 add_counter_to_ctx(counter, ctx);
723
724 /*
725 * Don't put the counter on if it is disabled or if
726 * it is in a group and the group isn't on.
727 */
728 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
729 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
730 goto unlock;
731
732 /*
733 * An exclusive counter can't go on if there are already active
734 * hardware counters, and no hardware counter can go on if there
735 * is already an exclusive counter on.
736 */
737 if (!group_can_go_on(counter, cpuctx, 1))
738 err = -EEXIST;
739 else
740 err = counter_sched_in(counter, cpuctx, ctx, cpu);
741
742 if (err) {
743 /*
744 * This counter couldn't go on. If it is in a group
745 * then we have to pull the whole group off.
746 * If the counter group is pinned then put it in error state.
747 */
748 if (leader != counter)
749 group_sched_out(leader, cpuctx, ctx);
750 if (leader->attr.pinned) {
751 update_group_times(leader);
752 leader->state = PERF_COUNTER_STATE_ERROR;
753 }
754 }
755
756 if (!err && !ctx->task && cpuctx->max_pertask)
757 cpuctx->max_pertask--;
758
759 unlock:
760 perf_enable();
761
762 spin_unlock(&ctx->lock);
763}
764
765/*
766 * Attach a performance counter to a context
767 *
768 * First we add the counter to the context's list, with the counter
769 * not yet enabled on the hardware.
770 *
771 * If the counter is attached to a task which is on a CPU we use an smp
772 * call to enable it in the task context. The task might have been
773 * scheduled away, but we re-check this in the smp call.
774 *
775 * Must be called with ctx->mutex held.
776 */
777static void
778perf_install_in_context(struct perf_counter_context *ctx,
779 struct perf_counter *counter,
780 int cpu)
781{
782 struct task_struct *task = ctx->task;
783
784 if (!task) {
785 /*
786 * Per cpu counters are installed via an smp call and
787 * the install is always successful.
788 */
789 smp_call_function_single(cpu, __perf_install_in_context,
790 counter, 1);
791 return;
792 }
793
794retry:
795 task_oncpu_function_call(task, __perf_install_in_context,
796 counter);
797
798 spin_lock_irq(&ctx->lock);
799 /*
800 * If the context is active and the counter has not been added yet, we need to retry the smp call.
801 */
802 if (ctx->is_active && list_empty(&counter->list_entry)) {
803 spin_unlock_irq(&ctx->lock);
804 goto retry;
805 }
806
807 /*
808 * The lock prevents this context from being scheduled in, so we
809 * can add the counter safely if the call above did not
810 * succeed.
811 */
812 if (list_empty(&counter->list_entry))
813 add_counter_to_ctx(counter, ctx);
814 spin_unlock_irq(&ctx->lock);
815}
816
817/*
818 * Cross CPU call to enable a performance counter
819 */
820static void __perf_counter_enable(void *info)
821{
822 struct perf_counter *counter = info;
823 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
824 struct perf_counter_context *ctx = counter->ctx;
825 struct perf_counter *leader = counter->group_leader;
826 int err;
827
828 /*
829 * If this is a per-task counter, need to check whether this
830 * counter's task is the current task on this cpu.
831 */
832 if (ctx->task && cpuctx->task_ctx != ctx) {
833 if (cpuctx->task_ctx || ctx->task != current)
834 return;
835 cpuctx->task_ctx = ctx;
836 }
837
838 spin_lock(&ctx->lock);
839 ctx->is_active = 1;
840 update_context_time(ctx);
841
842 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
843 goto unlock;
844 counter->state = PERF_COUNTER_STATE_INACTIVE;
845 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
846
847 /*
848 * If the counter is in a group and isn't the group leader,
849 * then don't put it on unless the group is on.
850 */
851 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
852 goto unlock;
853
854 if (!group_can_go_on(counter, cpuctx, 1)) {
855 err = -EEXIST;
856 } else {
857 perf_disable();
858 if (counter == leader)
859 err = group_sched_in(counter, cpuctx, ctx,
860 smp_processor_id());
861 else
862 err = counter_sched_in(counter, cpuctx, ctx,
863 smp_processor_id());
864 perf_enable();
865 }
866
867 if (err) {
868 /*
869 * If this counter can't go on and it's part of a
870 * group, then the whole group has to come off.
871 */
872 if (leader != counter)
873 group_sched_out(leader, cpuctx, ctx);
874 if (leader->attr.pinned) {
875 update_group_times(leader);
876 leader->state = PERF_COUNTER_STATE_ERROR;
877 }
878 }
879
880 unlock:
881 spin_unlock(&ctx->lock);
882}
883
884/*
885 * Enable a counter.
886 *
887 * If counter->ctx is a cloned context, callers must make sure that
888 * every task struct that counter->ctx->task could possibly point to
889 * remains valid. This condition is satisfied when called through
890 * perf_counter_for_each_child or perf_counter_for_each as described
891 * for perf_counter_disable.
892 */
893static void perf_counter_enable(struct perf_counter *counter)
894{
895 struct perf_counter_context *ctx = counter->ctx;
896 struct task_struct *task = ctx->task;
897
898 if (!task) {
899 /*
900 * Enable the counter on the cpu that it's on
901 */
902 smp_call_function_single(counter->cpu, __perf_counter_enable,
903 counter, 1);
904 return;
905 }
906
907 spin_lock_irq(&ctx->lock);
908 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
909 goto out;
910
911 /*
912 * If the counter is in error state, clear that first.
913 * That way, if we see the counter in error state below, we
914 * know that it has gone back into error state, as distinct
915 * from the task having been scheduled away before the
916 * cross-call arrived.
917 */
918 if (counter->state == PERF_COUNTER_STATE_ERROR)
919 counter->state = PERF_COUNTER_STATE_OFF;
920
921 retry:
922 spin_unlock_irq(&ctx->lock);
923 task_oncpu_function_call(task, __perf_counter_enable, counter);
924
925 spin_lock_irq(&ctx->lock);
926
927 /*
928 * If the context is active and the counter is still off,
929 * we need to retry the cross-call.
930 */
931 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
932 goto retry;
933
934 /*
935 * Since we have the lock this context can't be scheduled
936 * in, so we can change the state safely.
937 */
938 if (counter->state == PERF_COUNTER_STATE_OFF) {
939 counter->state = PERF_COUNTER_STATE_INACTIVE;
940 counter->tstamp_enabled =
941 ctx->time - counter->total_time_enabled;
942 }
943 out:
944 spin_unlock_irq(&ctx->lock);
945}
946
947static int perf_counter_refresh(struct perf_counter *counter, int refresh)
948{
949 /*
950 * not supported on inherited counters
951 */
952 if (counter->attr.inherit)
953 return -EINVAL;
954
955 atomic_add(refresh, &counter->event_limit);
956 perf_counter_enable(counter);
957
958 return 0;
959}
960
961void __perf_counter_sched_out(struct perf_counter_context *ctx,
962 struct perf_cpu_context *cpuctx)
963{
964 struct perf_counter *counter;
965
966 spin_lock(&ctx->lock);
967 ctx->is_active = 0;
968 if (likely(!ctx->nr_counters))
969 goto out;
970 update_context_time(ctx);
971
972 perf_disable();
973 if (ctx->nr_active) {
974 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
975 if (counter != counter->group_leader)
976 counter_sched_out(counter, cpuctx, ctx);
977 else
978 group_sched_out(counter, cpuctx, ctx);
979 }
980 }
981 perf_enable();
982 out:
983 spin_unlock(&ctx->lock);
984}
985
986/*
987 * Test whether two contexts are equivalent, i.e. whether they
988 * have both been cloned from the same version of the same context
989 * and they both have the same number of enabled counters.
990 * If the number of enabled counters is the same, then the set
991 * of enabled counters should be the same, because these are both
992 * inherited contexts, therefore we can't access individual counters
993 * in them directly with an fd; we can only enable/disable all
994 * counters via prctl, or enable/disable all counters in a family
995 * via ioctl, which will have the same effect on both contexts.
996 */
997static int context_equiv(struct perf_counter_context *ctx1,
998 struct perf_counter_context *ctx2)
999{
1000 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1001 && ctx1->parent_gen == ctx2->parent_gen
1002 && !ctx1->pin_count && !ctx2->pin_count;
1003}
1004
1005/*
1006 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled.
1008 *
1009 * We stop each counter and update the counter value in counter->count.
1010 *
1011 * This does not protect us against NMI, but disable()
1012 * sets the disabled bit in the control field of counter _before_
1013 * accessing the counter control register. If an NMI hits, then it will
1014 * not restart the counter.
1015 */
1016void perf_counter_task_sched_out(struct task_struct *task,
1017 struct task_struct *next, int cpu)
1018{
1019 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1020 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1021 struct perf_counter_context *next_ctx;
1022 struct perf_counter_context *parent;
1023 struct pt_regs *regs;
1024 int do_switch = 1;
1025
1026 regs = task_pt_regs(task);
1027 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1028
1029 if (likely(!ctx || !cpuctx->task_ctx))
1030 return;
1031
1032 update_context_time(ctx);
1033
1034 rcu_read_lock();
1035 parent = rcu_dereference(ctx->parent_ctx);
1036 next_ctx = next->perf_counter_ctxp;
1037 if (parent && next_ctx &&
1038 rcu_dereference(next_ctx->parent_ctx) == parent) {
1039 /*
1040 * Looks like the two contexts are clones, so we might be
1041 * able to optimize the context switch. We lock both
1042 * contexts and check that they are clones under the
1043 * lock (including re-checking that neither has been
1044 * uncloned in the meantime). It doesn't matter which
1045 * order we take the locks because no other cpu could
1046 * be trying to lock both of these tasks.
1047 */
1048 spin_lock(&ctx->lock);
1049 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1050 if (context_equiv(ctx, next_ctx)) {
1051 /*
1052 * XXX do we need a memory barrier of sorts
1053 * wrt to rcu_dereference() of perf_counter_ctxp
1054 */
1055 task->perf_counter_ctxp = next_ctx;
1056 next->perf_counter_ctxp = ctx;
1057 ctx->task = next;
1058 next_ctx->task = task;
1059 do_switch = 0;
1060 }
1061 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock);
1063 }
1064 rcu_read_unlock();
1065
1066 if (do_switch) {
1067 __perf_counter_sched_out(ctx, cpuctx);
1068 cpuctx->task_ctx = NULL;
1069 }
1070}
1071
1072/*
1073 * Called with IRQs disabled
1074 */
1075static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1076{
1077 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1078
1079 if (!cpuctx->task_ctx)
1080 return;
1081
1082 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1083 return;
1084
1085 __perf_counter_sched_out(ctx, cpuctx);
1086 cpuctx->task_ctx = NULL;
1087}
1088
1089/*
1090 * Called with IRQs disabled
1091 */
1092static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1093{
1094 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1095}
1096
1097static void
1098__perf_counter_sched_in(struct perf_counter_context *ctx,
1099 struct perf_cpu_context *cpuctx, int cpu)
1100{
1101 struct perf_counter *counter;
1102 int can_add_hw = 1;
1103
1104 spin_lock(&ctx->lock);
1105 ctx->is_active = 1;
1106 if (likely(!ctx->nr_counters))
1107 goto out;
1108
1109 ctx->timestamp = perf_clock();
1110
1111 perf_disable();
1112
1113 /*
1114 * First go through the list and put on any pinned groups
1115 * in order to give them the best chance of going on.
1116 */
1117 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1118 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1119 !counter->attr.pinned)
1120 continue;
1121 if (counter->cpu != -1 && counter->cpu != cpu)
1122 continue;
1123
1124 if (counter != counter->group_leader)
1125 counter_sched_in(counter, cpuctx, ctx, cpu);
1126 else {
1127 if (group_can_go_on(counter, cpuctx, 1))
1128 group_sched_in(counter, cpuctx, ctx, cpu);
1129 }
1130
1131 /*
1132 * If this pinned group hasn't been scheduled,
1133 * put it in error state.
1134 */
1135 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1136 update_group_times(counter);
1137 counter->state = PERF_COUNTER_STATE_ERROR;
1138 }
1139 }
1140
1141 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1142 /*
1143 * Ignore counters in OFF or ERROR state, and
1144 * ignore pinned counters since we did them already.
1145 */
1146 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1147 counter->attr.pinned)
1148 continue;
1149
1150 /*
1151 * Listen to the 'cpu' scheduling filter constraint
1152 * of counters:
1153 */
1154 if (counter->cpu != -1 && counter->cpu != cpu)
1155 continue;
1156
1157 if (counter != counter->group_leader) {
1158 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1159 can_add_hw = 0;
1160 } else {
1161 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1162 if (group_sched_in(counter, cpuctx, ctx, cpu))
1163 can_add_hw = 0;
1164 }
1165 }
1166 }
1167 perf_enable();
1168 out:
1169 spin_unlock(&ctx->lock);
1170}
1171
1172/*
1173 * Called from scheduler to add the counters of the current task
1174 * with interrupts disabled.
1175 *
1176 * We restore the counter value and then enable it.
1177 *
1178 * This does not protect us against NMI, but enable()
1179 * sets the enabled bit in the control field of counter _before_
1180 * accessing the counter control register. If an NMI hits, then it will
1181 * keep the counter running.
1182 */
1183void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1184{
1185 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1186 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1187
1188 if (likely(!ctx))
1189 return;
1190 if (cpuctx->task_ctx == ctx)
1191 return;
1192 __perf_counter_sched_in(ctx, cpuctx, cpu);
1193 cpuctx->task_ctx = ctx;
1194}
1195
1196static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1197{
1198 struct perf_counter_context *ctx = &cpuctx->ctx;
1199
1200 __perf_counter_sched_in(ctx, cpuctx, cpu);
1201}
1202
1203#define MAX_INTERRUPTS (~0ULL)
1204
1205static void perf_log_throttle(struct perf_counter *counter, int enable);
1206static void perf_log_period(struct perf_counter *counter, u64 period);
1207
1208static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{
1210 struct hw_perf_counter *hwc = &counter->hw;
1211 u64 period, sample_period;
1212 s64 delta;
1213
1214 events *= hwc->sample_period;
1215 period = div64_u64(events, counter->attr.sample_freq);
1216
1217 delta = (s64)(period - hwc->sample_period);
1218 delta = (delta + 7) / 8; /* low pass filter */
1219
1220 sample_period = hwc->sample_period + delta;
1221
1222 if (!sample_period)
1223 sample_period = 1;
1224
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period;
1228}
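
/*
 * A worked example of the adjustment above, with numbers picked purely
 * for illustration: sample_period == 10000, attr.sample_freq == 1000
 * and events == 150 (the estimated interrupt rate over the last
 * second) give
 *
 *	period = 150 * 10000 / 1000 = 1500
 *	delta  = (1500 - 10000 + 7) / 8 = -1061
 *
 * so sample_period becomes 8939, i.e. it moves roughly one eighth of
 * the way towards the period that would have produced the requested
 * frequency.
 */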
1229
1230static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1231{
1232 struct perf_counter *counter;
1233 struct hw_perf_counter *hwc;
1234 u64 interrupts, freq;
1235
1236 spin_lock(&ctx->lock);
1237 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1238 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1239 continue;
1240
1241 hwc = &counter->hw;
1242
1243 interrupts = hwc->interrupts;
1244 hwc->interrupts = 0;
1245
1246 /*
1247 * unthrottle counters on the tick
1248 */
1249 if (interrupts == MAX_INTERRUPTS) {
1250 perf_log_throttle(counter, 1);
1251 counter->pmu->unthrottle(counter);
1252 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1253 }
1254
1255 if (!counter->attr.freq || !counter->attr.sample_freq)
1256 continue;
1257
1258 /*
1259 * if the specified freq < HZ then we need to skip ticks
1260 */
1261 if (counter->attr.sample_freq < HZ) {
1262 freq = counter->attr.sample_freq;
1263
1264 hwc->freq_count += freq;
1265 hwc->freq_interrupts += interrupts;
1266
1267 if (hwc->freq_count < HZ)
1268 continue;
1269
1270 interrupts = hwc->freq_interrupts;
1271 hwc->freq_interrupts = 0;
1272 hwc->freq_count -= HZ;
1273 } else
1274 freq = HZ;
1275
1276 perf_adjust_period(counter, freq * interrupts);
1277
1278 /*
1279 * In order to avoid being stalled by an (accidentally) huge
1280 * sample period, force the counter to reprogram by clearing its
1281 * remaining period if we didn't get any events in this freq period.
1282 */
1283 if (!interrupts) {
1284 perf_disable();
1285 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter);
1288 perf_enable();
1289 }
1290 }
1291 spin_unlock(&ctx->lock);
1292}
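
/*
 * For example (with HZ == 1000, purely for illustration): a counter
 * with attr.sample_freq == 100 accumulates freq_count in steps of 100,
 * so only every 10th tick does freq_count reach HZ and the interrupts
 * collected over those ticks get folded into one perf_adjust_period()
 * call; counters with sample_freq >= HZ are adjusted on every tick.
 */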
1293
1294/*
1295 * Round-robin a context's counters:
1296 */
1297static void rotate_ctx(struct perf_counter_context *ctx)
1298{
1299 struct perf_counter *counter;
1300
1301 if (!ctx->nr_counters)
1302 return;
1303
1304 spin_lock(&ctx->lock);
1305 /*
1306 * Rotate the first entry last (works just fine for group counters too):
1307 */
1308 perf_disable();
1309 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1310 list_move_tail(&counter->list_entry, &ctx->counter_list);
1311 break;
1312 }
1313 perf_enable();
1314
1315 spin_unlock(&ctx->lock);
1316}
1317
1318void perf_counter_task_tick(struct task_struct *curr, int cpu)
1319{
1320 struct perf_cpu_context *cpuctx;
1321 struct perf_counter_context *ctx;
1322
1323 if (!atomic_read(&nr_counters))
1324 return;
1325
1326 cpuctx = &per_cpu(perf_cpu_context, cpu);
1327 ctx = curr->perf_counter_ctxp;
1328
1329 perf_ctx_adjust_freq(&cpuctx->ctx);
1330 if (ctx)
1331 perf_ctx_adjust_freq(ctx);
1332
1333 perf_counter_cpu_sched_out(cpuctx);
1334 if (ctx)
1335 __perf_counter_task_sched_out(ctx);
1336
1337 rotate_ctx(&cpuctx->ctx);
1338 if (ctx)
1339 rotate_ctx(ctx);
1340
1341 perf_counter_cpu_sched_in(cpuctx, cpu);
1342 if (ctx)
1343 perf_counter_task_sched_in(curr, cpu);
1344}
1345
1346/*
1347 * Cross CPU call to read the hardware counter
1348 */
1349static void __read(void *info)
1350{
1351 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags;
1354
1355 local_irq_save(flags);
1356 if (ctx->is_active)
1357 update_context_time(ctx);
1358 counter->pmu->read(counter);
1359 update_counter_times(counter);
1360 local_irq_restore(flags);
1361}
1362
1363static u64 perf_counter_read(struct perf_counter *counter)
1364{
1365 /*
1366 * If counter is enabled and currently active on a CPU, update the
1367 * value in the counter structure:
1368 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter);
1374 }
1375
1376 return atomic64_read(&counter->count);
1377}
1378
1379/*
1380 * Initialize the perf_counter context in a task_struct:
1381 */
1382static void
1383__perf_counter_init_context(struct perf_counter_context *ctx,
1384 struct task_struct *task)
1385{
1386 memset(ctx, 0, sizeof(*ctx));
1387 spin_lock_init(&ctx->lock);
1388 mutex_init(&ctx->mutex);
1389 INIT_LIST_HEAD(&ctx->counter_list);
1390 INIT_LIST_HEAD(&ctx->event_list);
1391 atomic_set(&ctx->refcount, 1);
1392 ctx->task = task;
1393}
1394
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task;
1401 unsigned long flags;
1402 int err;
1403
1404 /*
1405 * If cpu is not a wildcard then this is a percpu counter:
1406 */
1407 if (cpu != -1) {
1408 /* Must be root to operate on a CPU counter: */
1409 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1410 return ERR_PTR(-EACCES);
1411
1412 if (cpu < 0 || cpu > num_possible_cpus())
1413 return ERR_PTR(-EINVAL);
1414
1415 /*
1416 * We could be clever and allow attaching a counter to an
1417 * offline CPU and activate it when the CPU comes up, but
1418 * that's for later.
1419 */
1420 if (!cpu_isset(cpu, cpu_online_map))
1421 return ERR_PTR(-ENODEV);
1422
1423 cpuctx = &per_cpu(perf_cpu_context, cpu);
1424 ctx = &cpuctx->ctx;
1425 get_ctx(ctx);
1426
1427 return ctx;
1428 }
1429
1430 rcu_read_lock();
1431 if (!pid)
1432 task = current;
1433 else
1434 task = find_task_by_vpid(pid);
1435 if (task)
1436 get_task_struct(task);
1437 rcu_read_unlock();
1438
1439 if (!task)
1440 return ERR_PTR(-ESRCH);
1441
1442 /*
1443 * Can't attach counters to a dying task.
1444 */
1445 err = -ESRCH;
1446 if (task->flags & PF_EXITING)
1447 goto errout;
1448
1449 /* Reuse ptrace permission checks for now. */
1450 err = -EACCES;
1451 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1452 goto errout;
1453
1454 retry:
1455 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) {
1457 parent_ctx = ctx->parent_ctx;
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags);
1468 }
1469
1470 if (!ctx) {
1471 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1472 err = -ENOMEM;
1473 if (!ctx)
1474 goto errout;
1475 __perf_counter_init_context(ctx, task);
1476 get_ctx(ctx);
1477 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1478 /*
1479 * We raced with some other task; use
1480 * the context they set.
1481 */
1482 kfree(ctx);
1483 goto retry;
1484 }
1485 get_task_struct(task);
1486 }
1487
1488 put_task_struct(task);
1489 return ctx;
1490
1491 errout:
1492 put_task_struct(task);
1493 return ERR_PTR(err);
1494}
1495
1496static void free_counter_rcu(struct rcu_head *head)
1497{
1498 struct perf_counter *counter;
1499
1500 counter = container_of(head, struct perf_counter, rcu_head);
1501 if (counter->ns)
1502 put_pid_ns(counter->ns);
1503 kfree(counter);
1504}
1505
1506static void perf_pending_sync(struct perf_counter *counter);
1507
1508static void free_counter(struct perf_counter *counter)
1509{
1510 perf_pending_sync(counter);
1511
1512 atomic_dec(&nr_counters);
1513 if (counter->attr.mmap)
1514 atomic_dec(&nr_mmap_counters);
1515 if (counter->attr.comm)
1516 atomic_dec(&nr_comm_counters);
1517
1518 if (counter->destroy)
1519 counter->destroy(counter);
1520
1521 put_ctx(counter->ctx);
1522 call_rcu(&counter->rcu_head, free_counter_rcu);
1523}
1524
1525/*
1526 * Called when the last reference to the file is gone.
1527 */
1528static int perf_release(struct inode *inode, struct file *file)
1529{
1530 struct perf_counter *counter = file->private_data;
1531 struct perf_counter_context *ctx = counter->ctx;
1532
1533 file->private_data = NULL;
1534
1535 WARN_ON_ONCE(ctx->parent_ctx);
1536 mutex_lock(&ctx->mutex);
1537 perf_counter_remove_from_context(counter);
1538 mutex_unlock(&ctx->mutex);
1539
1540 mutex_lock(&counter->owner->perf_counter_mutex);
1541 list_del_init(&counter->owner_entry);
1542 mutex_unlock(&counter->owner->perf_counter_mutex);
1543 put_task_struct(counter->owner);
1544
1545 free_counter(counter);
1546
1547 return 0;
1548}
1549
1550/*
1551 * Read the performance counter - simple non blocking version for now
1552 */
1553static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{
1556 u64 values[3];
1557 int n;
1558
1559 /*
1560 * Return end-of-file for a read on a counter that is in
1561 * error state (i.e. because it was pinned but it couldn't be
1562 * scheduled on to the CPU at some point).
1563 */
1564 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0;
1566
1567 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter);
1570 n = 1;
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1572 values[n++] = counter->total_time_enabled +
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex);
1580
1581 if (count < n * sizeof(u64))
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589}
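
/*
 * From user space this read looks roughly like the sketch below, for a
 * counter opened with both PERF_FORMAT_TOTAL_TIME_ENABLED and
 * PERF_FORMAT_TOTAL_TIME_RUNNING set in attr.read_format (fd is the
 * counter file descriptor):
 *
 *	u64 values[3];
 *
 *	if (read(fd, values, sizeof(values)) == sizeof(values)) {
 *		count        = values[0];
 *		time_enabled = values[1];
 *		time_running = values[2];
 *	}
 */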
1590
1591static ssize_t
1592perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1593{
1594 struct perf_counter *counter = file->private_data;
1595
1596 return perf_read_hw(counter, buf, count);
1597}
1598
1599static unsigned int perf_poll(struct file *file, poll_table *wait)
1600{
1601 struct perf_counter *counter = file->private_data;
1602 struct perf_mmap_data *data;
1603 unsigned int events = POLLHUP;
1604
1605 rcu_read_lock();
1606 data = rcu_dereference(counter->data);
1607 if (data)
1608 events = atomic_xchg(&data->poll, 0);
1609 rcu_read_unlock();
1610
1611 poll_wait(file, &counter->waitq, wait);
1612
1613 return events;
1614}
1615
1616static void perf_counter_reset(struct perf_counter *counter)
1617{
1618 (void)perf_counter_read(counter);
1619 atomic64_set(&counter->count, 0);
1620 perf_counter_update_userpage(counter);
1621}
1622
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/*
1640 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block
1642 * in sync_child_counter if it goes to exit, thus satisfying the
1643 * task existence requirements of perf_counter_enable/disable.
1644 */
1645static void perf_counter_for_each_child(struct perf_counter *counter,
1646 void (*func)(struct perf_counter *))
1647{
1648 struct perf_counter *child;
1649
1650 WARN_ON_ONCE(counter->ctx->parent_ctx);
1651 mutex_lock(&counter->child_mutex);
1652 func(counter);
1653 list_for_each_entry(child, &counter->child_list, child_list)
1654 func(child);
1655 mutex_unlock(&counter->child_mutex);
1656}
1657
1658static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *))
1660{
1661 struct perf_counter *child;
1662
1663 WARN_ON_ONCE(counter->ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex);
1665 perf_counter_for_each_sibling(counter, func);
1666 list_for_each_entry(child, &counter->child_list, child_list)
1667 perf_counter_for_each_sibling(child, func);
1668 mutex_unlock(&counter->child_mutex);
1669}
1670
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1672{
1673 struct perf_counter_context *ctx = counter->ctx;
1674 unsigned long size;
1675 int ret = 0;
1676 u64 value;
1677
1678 if (!counter->attr.sample_period)
1679 return -EINVAL;
1680
1681 size = copy_from_user(&value, arg, sizeof(value));
1682 if (size != sizeof(value))
1683 return -EFAULT;
1684
1685 if (!value)
1686 return -EINVAL;
1687
1688 spin_lock_irq(&ctx->lock);
1689 if (counter->attr.freq) {
1690 if (value > sysctl_perf_counter_sample_rate) {
1691 ret = -EINVAL;
1692 goto unlock;
1693 }
1694
1695 counter->attr.sample_freq = value;
1696 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value;
1701 }
1702unlock:
1703 spin_unlock_irq(&ctx->lock);
1704
1705 return ret;
1706}
1707
1708static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1709{
1710 struct perf_counter *counter = file->private_data;
1711 void (*func)(struct perf_counter *);
1712 u32 flags = arg;
1713
1714 switch (cmd) {
1715 case PERF_COUNTER_IOC_ENABLE:
1716 func = perf_counter_enable;
1717 break;
1718 case PERF_COUNTER_IOC_DISABLE:
1719 func = perf_counter_disable;
1720 break;
1721 case PERF_COUNTER_IOC_RESET:
1722 func = perf_counter_reset;
1723 break;
1724
1725 case PERF_COUNTER_IOC_REFRESH:
1726 return perf_counter_refresh(counter, arg);
1727
1728 case PERF_COUNTER_IOC_PERIOD:
1729 return perf_counter_period(counter, (u64 __user *)arg);
1730
1731 default:
1732 return -ENOTTY;
1733 }
1734
1735 if (flags & PERF_IOC_FLAG_GROUP)
1736 perf_counter_for_each(counter, func);
1737 else
1738 perf_counter_for_each_child(counter, func);
1739
1740 return 0;
1741}
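
/*
 * A typical user-space sequence driving these commands, as a sketch
 * (passing PERF_IOC_FLAG_GROUP as the argument instead of 0 would
 * apply the operation to the whole counter group):
 *
 *	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
 *	... run the measured workload ...
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 */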
1742
1743int perf_counter_task_enable(void)
1744{
1745 struct perf_counter *counter;
1746
1747 mutex_lock(&current->perf_counter_mutex);
1748 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1749 perf_counter_for_each_child(counter, perf_counter_enable);
1750 mutex_unlock(&current->perf_counter_mutex);
1751
1752 return 0;
1753}
1754
1755int perf_counter_task_disable(void)
1756{
1757 struct perf_counter *counter;
1758
1759 mutex_lock(&current->perf_counter_mutex);
1760 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1761 perf_counter_for_each_child(counter, perf_counter_disable);
1762 mutex_unlock(&current->perf_counter_mutex);
1763
1764 return 0;
1765}
1766
1767/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic breaks. We cannot serialize this because the arch
1770 * code calls this from NMI context.
1771 */
1772void perf_counter_update_userpage(struct perf_counter *counter)
1773{
1774 struct perf_counter_mmap_page *userpg;
1775 struct perf_mmap_data *data;
1776
1777 rcu_read_lock();
1778 data = rcu_dereference(counter->data);
1779 if (!data)
1780 goto unlock;
1781
1782 userpg = data->user_page;
1783
1784 /*
1785 * Disable preemption so as to not let the corresponding user-space
1786 * spin too long if we get preempted.
1787 */
1788 preempt_disable();
1789 ++userpg->lock;
1790 barrier();
1791 userpg->index = counter->hw.idx;
1792 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795
1796 barrier();
1797 ++userpg->lock;
1798 preempt_enable();
1799unlock:
1800 rcu_read_unlock();
1801}
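
/*
 * User space pairs with the ->lock sequence count above roughly as
 * follows (pc points at the mmap()ed meta-data page; pmc_read() stands
 * in for whatever the architecture uses to read hardware counter
 * pc->index - 1 directly):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (idx)
 *			count += pmc_read(idx - 1);
 *		barrier();
 *	} while (pc->lock != seq);
 */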
1802
1803static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1804{
1805 struct perf_counter *counter = vma->vm_file->private_data;
1806 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS;
1808
1809 rcu_read_lock();
1810 data = rcu_dereference(counter->data);
1811 if (!data)
1812 goto unlock;
1813
1814 if (vmf->pgoff == 0) {
1815 vmf->page = virt_to_page(data->user_page);
1816 } else {
1817 int nr = vmf->pgoff - 1;
1818
1819 if ((unsigned)nr >= data->nr_pages)
1820 goto unlock;
1821
1822 vmf->page = virt_to_page(data->data_pages[nr]);
1823 }
1824 get_page(vmf->page);
1825 ret = 0;
1826unlock:
1827 rcu_read_unlock();
1828
1829 return ret;
1830}
1831
1832static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1833{
1834 struct perf_mmap_data *data;
1835 unsigned long size;
1836 int i;
1837
1838 WARN_ON(atomic_read(&counter->mmap_count));
1839
1840 size = sizeof(struct perf_mmap_data);
1841 size += nr_pages * sizeof(void *);
1842
1843 data = kzalloc(size, GFP_KERNEL);
1844 if (!data)
1845 goto fail;
1846
1847 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1848 if (!data->user_page)
1849 goto fail_user_page;
1850
1851 for (i = 0; i < nr_pages; i++) {
1852 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1853 if (!data->data_pages[i])
1854 goto fail_data_pages;
1855 }
1856
1857 data->nr_pages = nr_pages;
1858 atomic_set(&data->lock, -1);
1859
1860 rcu_assign_pointer(counter->data, data);
1861
1862 return 0;
1863
1864fail_data_pages:
1865 for (i--; i >= 0; i--)
1866 free_page((unsigned long)data->data_pages[i]);
1867
1868 free_page((unsigned long)data->user_page);
1869
1870fail_user_page:
1871 kfree(data);
1872
1873fail:
1874 return -ENOMEM;
1875}
1876
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{
1879 struct perf_mmap_data *data;
1880 int i;
1881
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883
1884 free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]);
1887 kfree(data);
1888}
1889
1890static void perf_mmap_data_free(struct perf_counter *counter)
1891{
1892 struct perf_mmap_data *data = counter->data;
1893
1894 WARN_ON(atomic_read(&counter->mmap_count));
1895
1896 rcu_assign_pointer(counter->data, NULL);
1897 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1898}
1899
1900static void perf_mmap_open(struct vm_area_struct *vma)
1901{
1902 struct perf_counter *counter = vma->vm_file->private_data;
1903
1904 atomic_inc(&counter->mmap_count);
1905}
1906
1907static void perf_mmap_close(struct vm_area_struct *vma)
1908{
1909 struct perf_counter *counter = vma->vm_file->private_data;
1910
1911 WARN_ON_ONCE(counter->ctx->parent_ctx);
1912 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
1913 struct user_struct *user = current_user();
1914
1915 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1916 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1917 perf_mmap_data_free(counter);
1918 mutex_unlock(&counter->mmap_mutex);
1919 }
1920}
1921
1922static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open,
1924 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault,
1926};
1927
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1929{
1930 struct perf_counter *counter = file->private_data;
1931 unsigned long user_locked, user_lock_limit;
1932 struct user_struct *user = current_user();
1933 unsigned long locked, lock_limit;
1934 unsigned long vma_size;
1935 unsigned long nr_pages;
1936 long user_extra, extra;
1937 int ret = 0;
1938
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1940 return -EINVAL;
1941
1942 vma_size = vma->vm_end - vma->vm_start;
1943 nr_pages = (vma_size / PAGE_SIZE) - 1;
1944
1945 /*
1946 * If we have data pages ensure they're a power-of-two number, so we
1947 * can do bitmasks instead of modulo.
1948 */
1949 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1950 return -EINVAL;
1951
1952 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1953 return -EINVAL;
1954
1955 if (vma->vm_pgoff != 0)
1956 return -EINVAL;
1957
1958 WARN_ON_ONCE(counter->ctx->parent_ctx);
1959 mutex_lock(&counter->mmap_mutex);
1960 if (atomic_inc_not_zero(&counter->mmap_count)) {
1961 if (nr_pages != counter->data->nr_pages)
1962 ret = -EINVAL;
1963 goto unlock;
1964 }
1965
1966 user_extra = nr_pages + 1;
1967 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1968
1969 /*
1970 * Increase the limit linearly with more CPUs:
1971 */
1972 user_lock_limit *= num_online_cpus();
1973
1974 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1975
1976 extra = 0;
1977 if (user_locked > user_lock_limit)
1978 extra = user_locked - user_lock_limit;
1979
1980 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1981 lock_limit >>= PAGE_SHIFT;
1982 locked = vma->vm_mm->locked_vm + extra;
1983
1984 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1985 ret = -EPERM;
1986 goto unlock;
1987 }
1988
1989 WARN_ON(counter->data);
1990 ret = perf_mmap_data_alloc(counter, nr_pages);
1991 if (ret)
1992 goto unlock;
1993
1994 atomic_set(&counter->mmap_count, 1);
1995 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra;
1998unlock:
1999 mutex_unlock(&counter->mmap_mutex);
2000
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops;
2004
2005 return ret;
2006}
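
/*
 * The matching user-space mapping must be shared, read-only and sized
 * as one meta-data page plus a power-of-two number of data pages, e.g.
 * (a sketch, with page_size taken from sysconf(_SC_PAGESIZE)):
 *
 *	nr_data_pages = 1 << shift;
 *	base = mmap(NULL, (nr_data_pages + 1) * page_size,
 *		    PROT_READ, MAP_SHARED, fd, 0);
 */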
2007
2008static int perf_fasync(int fd, struct file *filp, int on)
2009{
2010 struct inode *inode = filp->f_path.dentry->d_inode;
2011 struct perf_counter *counter = filp->private_data;
2012 int retval;
2013
2014 mutex_lock(&inode->i_mutex);
2015 retval = fasync_helper(fd, filp, on, &counter->fasync);
2016 mutex_unlock(&inode->i_mutex);
2017
2018 if (retval < 0)
2019 return retval;
2020
2021 return 0;
2022}
2023
2024static const struct file_operations perf_fops = {
2025 .release = perf_release,
2026 .read = perf_read,
2027 .poll = perf_poll,
2028 .unlocked_ioctl = perf_ioctl,
2029 .compat_ioctl = perf_ioctl,
2030 .mmap = perf_mmap,
2031 .fasync = perf_fasync,
2032};
2033
2034/*
2035 * Perf counter wakeup
2036 *
2037 * If there's data, ensure we set the poll() state and publish everything
2038 * to user-space before waking everybody up.
2039 */
2040
2041void perf_counter_wakeup(struct perf_counter *counter)
2042{
2043 wake_up_all(&counter->waitq);
2044
2045 if (counter->pending_kill) {
2046 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2047 counter->pending_kill = 0;
2048 }
2049}
2050
2051/*
2052 * Pending wakeups
2053 *
2054 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2055 *
2056 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2057 * singly-linked list and use cmpxchg() to add entries locklessly.
2058 */
2059
2060static void perf_pending_counter(struct perf_pending_entry *entry)
2061{
2062 struct perf_counter *counter = container_of(entry,
2063 struct perf_counter, pending);
2064
2065 if (counter->pending_disable) {
2066 counter->pending_disable = 0;
2067 perf_counter_disable(counter);
2068 }
2069
2070 if (counter->pending_wakeup) {
2071 counter->pending_wakeup = 0;
2072 perf_counter_wakeup(counter);
2073 }
2074}
2075
2076#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2077
2078static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2079 PENDING_TAIL,
2080};
2081
2082static void perf_pending_queue(struct perf_pending_entry *entry,
2083 void (*func)(struct perf_pending_entry *))
2084{
2085 struct perf_pending_entry **head;
2086
2087 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2088 return;
2089
2090 entry->func = func;
2091
2092 head = &get_cpu_var(perf_pending_head);
2093
2094 do {
2095 entry->next = *head;
2096 } while (cmpxchg(head, entry->next, entry) != entry->next);
2097
2098 set_perf_counter_pending();
2099
2100 put_cpu_var(perf_pending_head);
2101}
2102
2103static int __perf_pending_run(void)
2104{
2105 struct perf_pending_entry *list;
2106 int nr = 0;
2107
2108 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2109 while (list != PENDING_TAIL) {
2110 void (*func)(struct perf_pending_entry *);
2111 struct perf_pending_entry *entry = list;
2112
2113 list = list->next;
2114
2115 func = entry->func;
2116 entry->next = NULL;
2117 /*
2118 * Ensure we observe the unqueue before we issue the wakeup,
2119 * so that we won't be waiting forever.
2120 * -- see perf_not_pending().
2121 */
2122 smp_wmb();
2123
2124 func(entry);
2125 nr++;
2126 }
2127
2128 return nr;
2129}
2130
2131static inline int perf_not_pending(struct perf_counter *counter)
2132{
2133 /*
2134 * If we flush on whatever cpu we run, there is a chance we don't
2135 * need to wait.
2136 */
2137 get_cpu();
2138 __perf_pending_run();
2139 put_cpu();
2140
2141 /*
2142 * Ensure we see the proper queue state before going to sleep
2144 * so that we do not miss the wakeup. -- see __perf_pending_run()
2144 */
2145 smp_rmb();
2146 return counter->pending.next == NULL;
2147}
2148
2149static void perf_pending_sync(struct perf_counter *counter)
2150{
2151 wait_event(counter->waitq, perf_not_pending(counter));
2152}
2153
2154void perf_counter_do_pending(void)
2155{
2156 __perf_pending_run();
2157}
2158
2159/*
2160 * Callchain support -- arch specific
2161 */
2162
2163__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2164{
2165 return NULL;
2166}
2167
2168/*
2169 * Output
2170 */
2171
2172struct perf_output_handle {
2173 struct perf_counter *counter;
2174 struct perf_mmap_data *data;
2175 unsigned long head;
2176 unsigned long offset;
2177 int nmi;
2178 int overflow;
2179 int locked;
2180 unsigned long flags;
2181};
2182
2183static void perf_output_wakeup(struct perf_output_handle *handle)
2184{
2185 atomic_set(&handle->data->poll, POLLIN);
2186
2187 if (handle->nmi) {
2188 handle->counter->pending_wakeup = 1;
2189 perf_pending_queue(&handle->counter->pending,
2190 perf_pending_counter);
2191 } else
2192 perf_counter_wakeup(handle->counter);
2193}
2194
2195/*
2196 * Curious locking construct.
2197 *
2198 * We need to ensure a later event doesn't publish a head when a former
2199 * event isn't done writing. However since we need to deal with NMIs we
2200 * cannot fully serialize things.
2201 *
2202 * What we do is serialize between CPUs so we only have to deal with NMI
2203 * nesting on a single CPU.
2204 *
2205 * We only publish the head (and generate a wakeup) when the outer-most
2206 * event completes.
2207 */
2208static void perf_output_lock(struct perf_output_handle *handle)
2209{
2210 struct perf_mmap_data *data = handle->data;
2211 int cpu;
2212
2213 handle->locked = 0;
2214
2215 local_irq_save(handle->flags);
2216 cpu = smp_processor_id();
2217
2218 if (in_nmi() && atomic_read(&data->lock) == cpu)
2219 return;
2220
2221 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2222 cpu_relax();
2223
2224 handle->locked = 1;
2225}
2226
2227static void perf_output_unlock(struct perf_output_handle *handle)
2228{
2229 struct perf_mmap_data *data = handle->data;
2230 unsigned long head;
2231 int cpu;
2232
2233 data->done_head = data->head;
2234
2235 if (!handle->locked)
2236 goto out;
2237
2238again:
2239 /*
2240 * The xchg implies a full barrier that ensures all writes are done
2241 * before we publish the new head, matched by a rmb() in userspace when
2242 * reading this position.
2243 */
2244 while ((head = atomic_long_xchg(&data->done_head, 0)))
2245 data->user_page->data_head = head;
2246
2247 /*
2248 * NMI can happen here, which means we can miss a done_head update.
2249 */
2250
2251 cpu = atomic_xchg(&data->lock, -1);
2252 WARN_ON_ONCE(cpu != smp_processor_id());
2253
2254 /*
2255 * Therefore we have to check whether we did miss an update.
2256 */
2257 if (unlikely(atomic_long_read(&data->done_head))) {
2258 /*
2259 * Since we had it locked, we can lock it again.
2260 */
2261 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2262 cpu_relax();
2263
2264 goto again;
2265 }
2266
2267 if (atomic_xchg(&data->wakeup, 0))
2268 perf_output_wakeup(handle);
2269out:
2270 local_irq_restore(handle->flags);
2271}
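
/*
 * The user-space consumer of data_head mirrors the barrier implied by
 * the xchg() above roughly as follows (a sketch; the records between
 * the consumer's last position and head are then safe to read):
 *
 *	head = pc->data_head;
 *	rmb();
 *	... consume records up to head ...
 */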
2272
2273static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow)
2276{
2277 struct perf_mmap_data *data;
2278 unsigned int offset, head;
2279
2280 /*
2281 * For inherited counters we send all the output towards the parent.
2282 */
2283 if (counter->parent)
2284 counter = counter->parent;
2285
2286 rcu_read_lock();
2287 data = rcu_dereference(counter->data);
2288 if (!data)
2289 goto out;
2290
2291 handle->data = data;
2292 handle->counter = counter;
2293 handle->nmi = nmi;
2294 handle->overflow = overflow;
2295
2296 if (!data->nr_pages)
2297 goto fail;
2298
2299 perf_output_lock(handle);
2300
2301 do {
2302 offset = head = atomic_long_read(&data->head);
2303 head += size;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305
2306 handle->offset = offset;
2307 handle->head = head;
2308
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1);
2311
2312 return 0;
2313
2314fail:
2315 perf_output_wakeup(handle);
2316out:
2317 rcu_read_unlock();
2318
2319 return -ENOSPC;
2320}
2321
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
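
/*
 * Because nr_pages is a power of two, the "& pages_mask" above is just
 * the offset modulo the buffer size. For example, with nr_pages == 4
 * (pages_mask == 3) an offset of 5 * PAGE_SIZE + 100 copies into
 * data_pages[5 & 3] == data_pages[1] at page offset 100.
 */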
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle)
2362{
2363 struct perf_counter *counter = handle->counter;
2364 struct perf_mmap_data *data = handle->data;
2365
2366 int wakeup_events = counter->attr.wakeup_events;
2367
2368 if (handle->overflow && wakeup_events) {
2369 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events);
2372 atomic_set(&data->wakeup, 1);
2373 }
2374 }
2375
2376 perf_output_unlock(handle);
2377 rcu_read_unlock();
2378}
2379
2380static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2381{
2382 /*
2383 * only top level counters have the pid namespace they were created in
2384 */
2385 if (counter->parent)
2386 counter = counter->parent;
2387
2388 return task_tgid_nr_ns(p, counter->ns);
2389}
2390
2391static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2392{
2393 /*
2394 * only top level counters have the pid namespace they were created in
2395 */
2396 if (counter->parent)
2397 counter = counter->parent;
2398
2399 return task_pid_nr_ns(p, counter->ns);
2400}
2401
2402static void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data)
2404{
2405 int ret;
2406 u64 sample_type = counter->attr.sample_type;
2407 struct perf_output_handle handle;
2408 struct perf_event_header header;
2409 u64 ip;
2410 struct {
2411 u32 pid, tid;
2412 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0;
2419 u64 time;
2420 struct {
2421 u32 cpu, reserved;
2422 } cpu_entry;
2423
2424 header.type = 0;
2425 header.size = sizeof(header);
2426
2427 header.misc = PERF_EVENT_MISC_OVERFLOW;
2428 header.misc |= perf_misc_flags(data->regs);
2429
2430 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip);
2434 }
2435
2436 if (sample_type & PERF_SAMPLE_TID) {
2437 /* namespace issues */
2438 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current);
2440
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry);
2443 }
2444
2445 if (sample_type & PERF_SAMPLE_TIME) {
2446 /*
2447 * Maybe do better on x86 and provide cpu_clock_nmi()
2448 */
2449 time = sched_clock();
2450
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64);
2453 }
2454
2455 if (sample_type & PERF_SAMPLE_ADDR) {
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64);
2458 }
2459
2460 if (sample_type & PERF_SAMPLE_ID) {
2461 header.type |= PERF_SAMPLE_ID;
2462 header.size += sizeof(u64);
2463 }
2464
2465 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry);
2468
2469 cpu_entry.cpu = raw_smp_processor_id();
2470 }
2471
2472 if (sample_type & PERF_SAMPLE_PERIOD) {
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64);
2475 }
2476
2477 if (sample_type & PERF_SAMPLE_GROUP) {
2478 header.type |= PERF_SAMPLE_GROUP;
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs);
2485
2486 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size;
2491 }
2492 }
2493
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2495 if (ret)
2496 return;
2497
2498 perf_output_put(&handle, header);
2499
2500 if (sample_type & PERF_SAMPLE_IP)
2501 perf_output_put(&handle, ip);
2502
2503 if (sample_type & PERF_SAMPLE_TID)
2504 perf_output_put(&handle, tid_entry);
2505
2506 if (sample_type & PERF_SAMPLE_TIME)
2507 perf_output_put(&handle, time);
2508
2509 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr);
2511
2512 if (sample_type & PERF_SAMPLE_ID)
2513 perf_output_put(&handle, counter->id);
2514
2515 if (sample_type & PERF_SAMPLE_CPU)
2516 perf_output_put(&handle, cpu_entry);
2517
2518 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period);
2520
2521 /*
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527
2528 perf_output_put(&handle, nr);
2529
2530 leader = counter->group_leader;
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2532 if (sub != counter)
2533 sub->pmu->read(sub);
2534
2535 group_entry.id = sub->id;
2536 group_entry.counter = atomic64_read(&sub->count);
2537
2538 perf_output_put(&handle, group_entry);
2539 }
2540 }
2541
2542 if (callchain)
2543 perf_output_copy(&handle, callchain, callchain_size);
2544
2545 perf_output_end(&handle);
2546}
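/*
 * For reference, the record emitted by perf_counter_output() above is a
 * variable-size blob: a perf_event_header followed by the optional fields in
 * exactly the order the sample_type bits are tested.  A sample with every bit
 * set would lay out roughly as:
 *
 *	struct perf_event_header header;      // type, misc, size
 *	u64 ip;                               // PERF_SAMPLE_IP
 *	u32 pid, tid;                         // PERF_SAMPLE_TID
 *	u64 time;                             // PERF_SAMPLE_TIME
 *	u64 addr;                             // PERF_SAMPLE_ADDR
 *	u64 id;                               // PERF_SAMPLE_ID
 *	u32 cpu, reserved;                    // PERF_SAMPLE_CPU
 *	u64 period;                           // PERF_SAMPLE_PERIOD
 *	u64 nr; { u64 id, counter; } [nr];    // PERF_SAMPLE_GROUP
 *	u64 nr; u64 ips[nr];                  // PERF_SAMPLE_CALLCHAIN
 */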
2547
2548/*
2549 * fork tracking
2550 */
2551
2552struct perf_fork_event {
2553 struct task_struct *task;
2554
2555 struct {
2556 struct perf_event_header header;
2557
2558 u32 pid;
2559 u32 ppid;
2560 } event;
2561};
2562
2563static void perf_counter_fork_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event)
2565{
2566 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size;
2568 struct task_struct *task = fork_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570
2571 if (ret)
2572 return;
2573
2574 fork_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2576
2577 perf_output_put(&handle, fork_event->event);
2578 perf_output_end(&handle);
2579}
2580
2581static int perf_counter_fork_match(struct perf_counter *counter)
2582{
2583 if (counter->attr.comm || counter->attr.mmap)
2584 return 1;
2585
2586 return 0;
2587}
2588
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event)
2591{
2592 struct perf_counter *counter;
2593
2594 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2595 return;
2596
2597 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter))
2600 perf_counter_fork_output(counter, fork_event);
2601 }
2602 rcu_read_unlock();
2603}
2604
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2606{
2607 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx;
2609
2610 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2612 put_cpu_var(perf_cpu_context);
2613
2614 rcu_read_lock();
2615 /*
2616 * it doesn't really matter which of the child contexts the
2617 * event ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event);
2622 rcu_read_unlock();
2623}
2624
2625void perf_counter_fork(struct task_struct *task)
2626{
2627 struct perf_fork_event fork_event;
2628
2629 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters))
2631 return;
2632
2633 fork_event = (struct perf_fork_event){
2634 .task = task,
2635 .event = {
2636 .header = {
2637 .type = PERF_EVENT_FORK,
2638 .size = sizeof(fork_event.event),
2639 },
2640 },
2641 };
2642
2643 perf_counter_fork_event(&fork_event);
2644}
2645
2646/*
2647 * comm tracking
2648 */
2649
2650struct perf_comm_event {
2651 struct task_struct *task;
2652 char *comm;
2653 int comm_size;
2654
2655 struct {
2656 struct perf_event_header header;
2657
2658 u32 pid;
2659 u32 tid;
2660 } event;
2661};
2662
2663static void perf_counter_comm_output(struct perf_counter *counter,
2664 struct perf_comm_event *comm_event)
2665{
2666 struct perf_output_handle handle;
2667 int size = comm_event->event.header.size;
2668 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2669
2670 if (ret)
2671 return;
2672
2673 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2674 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2675
2676 perf_output_put(&handle, comm_event->event);
2677 perf_output_copy(&handle, comm_event->comm,
2678 comm_event->comm_size);
2679 perf_output_end(&handle);
2680}
2681
2682static int perf_counter_comm_match(struct perf_counter *counter)
2683{
2684 if (counter->attr.comm)
2685 return 1;
2686
2687 return 0;
2688}
2689
2690static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2691 struct perf_comm_event *comm_event)
2692{
2693 struct perf_counter *counter;
2694
2695 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2696 return;
2697
2698 rcu_read_lock();
2699 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2700 if (perf_counter_comm_match(counter))
2701 perf_counter_comm_output(counter, comm_event);
2702 }
2703 rcu_read_unlock();
2704}
2705
2706static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2707{
2708 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx;
2710 unsigned int size;
2711 char *comm = comm_event->task->comm;
2712
2713 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714
2715 comm_event->comm = comm;
2716 comm_event->comm_size = size;
2717
2718 comm_event->event.header.size = sizeof(comm_event->event) + size;
2719
2720 cpuctx = &get_cpu_var(perf_cpu_context);
2721 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2722 put_cpu_var(perf_cpu_context);
2723
2724 rcu_read_lock();
2725 /*
2726 * it doesn't really matter which of the child contexts the
2727 * event ends up in.
2728 */
2729 ctx = rcu_dereference(current->perf_counter_ctxp);
2730 if (ctx)
2731 perf_counter_comm_ctx(ctx, comm_event);
2732 rcu_read_unlock();
2733}
2734
2735void perf_counter_comm(struct task_struct *task)
2736{
2737 struct perf_comm_event comm_event;
2738
2739 if (!atomic_read(&nr_comm_counters))
2740 return;
2741
2742 comm_event = (struct perf_comm_event){
2743 .task = task,
2744 .event = {
2745 .header = { .type = PERF_EVENT_COMM, },
2746 },
2747 };
2748
2749 perf_counter_comm_event(&comm_event);
2750}
2751
2752/*
2753 * mmap tracking
2754 */
2755
2756struct perf_mmap_event {
2757 struct vm_area_struct *vma;
2758
2759 const char *file_name;
2760 int file_size;
2761
2762 struct {
2763 struct perf_event_header header;
2764
2765 u32 pid;
2766 u32 tid;
2767 u64 start;
2768 u64 len;
2769 u64 pgoff;
2770 } event;
2771};
2772
2773static void perf_counter_mmap_output(struct perf_counter *counter,
2774 struct perf_mmap_event *mmap_event)
2775{
2776 struct perf_output_handle handle;
2777 int size = mmap_event->event.header.size;
2778 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2779
2780 if (ret)
2781 return;
2782
2783 mmap_event->event.pid = perf_counter_pid(counter, current);
2784 mmap_event->event.tid = perf_counter_tid(counter, current);
2785
2786 perf_output_put(&handle, mmap_event->event);
2787 perf_output_copy(&handle, mmap_event->file_name,
2788 mmap_event->file_size);
2789 perf_output_end(&handle);
2790}
2791
2792static int perf_counter_mmap_match(struct perf_counter *counter,
2793 struct perf_mmap_event *mmap_event)
2794{
2795 if (counter->attr.mmap)
2796 return 1;
2797
2798 return 0;
2799}
2800
2801static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2802 struct perf_mmap_event *mmap_event)
2803{
2804 struct perf_counter *counter;
2805
2806 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2807 return;
2808
2809 rcu_read_lock();
2810 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2811 if (perf_counter_mmap_match(counter, mmap_event))
2812 perf_counter_mmap_output(counter, mmap_event);
2813 }
2814 rcu_read_unlock();
2815}
2816
2817static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2818{
2819 struct perf_cpu_context *cpuctx;
2820 struct perf_counter_context *ctx;
2821 struct vm_area_struct *vma = mmap_event->vma;
2822 struct file *file = vma->vm_file;
2823 unsigned int size;
2824 char tmp[16];
2825 char *buf = NULL;
2826 const char *name;
2827
2828 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2830 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name;
2833 }
2834 name = d_path(&file->f_path, buf, PATH_MAX);
2835 if (IS_ERR(name)) {
2836 name = strncpy(tmp, "//toolong", sizeof(tmp));
2837 goto got_name;
2838 }
2839 } else {
2840 name = arch_vma_name(mmap_event->vma);
2841 if (name)
2842 goto got_name;
2843
2844 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp));
2846 goto got_name;
2847 }
2848
2849 name = strncpy(tmp, "//anon", sizeof(tmp));
2850 goto got_name;
2851 }
2852
2853got_name:
2854 size = ALIGN(strlen(name)+1, sizeof(u64));
2855
2856 mmap_event->file_name = name;
2857 mmap_event->file_size = size;
2858
2859 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2860
2861 cpuctx = &get_cpu_var(perf_cpu_context);
2862 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2863 put_cpu_var(perf_cpu_context);
2864
2865 rcu_read_lock();
2866 /*
2867 * it doesn't really matter which of the child contexts the
2868 * event ends up in.
2869 */
2870 ctx = rcu_dereference(current->perf_counter_ctxp);
2871 if (ctx)
2872 perf_counter_mmap_ctx(ctx, mmap_event);
2873 rcu_read_unlock();
2874
2875 kfree(buf);
2876}
2877
2878void __perf_counter_mmap(struct vm_area_struct *vma)
2879{
2880 struct perf_mmap_event mmap_event;
2881
2882 if (!atomic_read(&nr_mmap_counters))
2883 return;
2884
2885 mmap_event = (struct perf_mmap_event){
2886 .vma = vma,
2887 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, },
2889 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff,
2892 },
2893 };
2894
2895 perf_counter_mmap_event(&mmap_event);
2896}
2897
2898/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging
2943 */
2944
2945static void perf_log_throttle(struct perf_counter *counter, int enable)
2946{
2947 struct perf_output_handle handle;
2948 int ret;
2949
2950 struct {
2951 struct perf_event_header header;
2952 u64 time;
2953 u64 id;
2954 } throttle_event = {
2955 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1,
2957 .misc = 0,
2958 .size = sizeof(throttle_event),
2959 },
2960 .time = sched_clock(),
2961 .id = counter->id,
2962 };
2963
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret)
2966 return;
2967
2968 perf_output_put(&handle, throttle_event);
2969 perf_output_end(&handle);
2970}
2971
2972/*
2973 * Generic counter overflow handling.
2974 */
2975
2976int perf_counter_overflow(struct perf_counter *counter, int nmi,
2977 struct perf_sample_data *data)
2978{
2979 int events = atomic_read(&counter->event_limit);
2980 int throttle = counter->pmu->unthrottle != NULL;
2981 struct hw_perf_counter *hwc = &counter->hw;
2982 int ret = 0;
2983
2984 if (!throttle) {
2985 hwc->interrupts++;
2986 } else {
2987 if (hwc->interrupts != MAX_INTERRUPTS) {
2988 hwc->interrupts++;
2989 if (HZ * hwc->interrupts >
2990 (u64)sysctl_perf_counter_sample_rate) {
2991 hwc->interrupts = MAX_INTERRUPTS;
2992 perf_log_throttle(counter, 0);
2993 ret = 1;
2994 }
2995 } else {
2996 /*
2997 * Keep re-disabling the counter even though we disabled it
2998 * on the previous pass - just in case we raced with a
2999 * sched-in and the counter got enabled again:
3000 */
3001 ret = 1;
3002 }
3003 }
3004
3005 if (counter->attr.freq) {
3006 u64 now = sched_clock();
3007 s64 delta = now - hwc->freq_stamp;
3008
3009 hwc->freq_stamp = now;
3010
3011 if (delta > 0 && delta < TICK_NSEC)
3012 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3013 }
3014
3015 /*
3016 * XXX event_limit might not quite work as expected on inherited
3017 * counters
3018 */
3019
3020 counter->pending_kill = POLL_IN;
3021 if (events && atomic_dec_and_test(&counter->event_limit)) {
3022 ret = 1;
3023 counter->pending_kill = POLL_HUP;
3024 if (nmi) {
3025 counter->pending_disable = 1;
3026 perf_pending_queue(&counter->pending,
3027 perf_pending_counter);
3028 } else
3029 perf_counter_disable(counter);
3030 }
3031
3032 perf_counter_output(counter, nmi, data);
3033 return ret;
3034}
3035
3036/*
3037 * Generic software counter infrastructure
3038 */
3039
3040static void perf_swcounter_update(struct perf_counter *counter)
3041{
3042 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now;
3044 s64 delta;
3045
3046again:
3047 prev = atomic64_read(&hwc->prev_count);
3048 now = atomic64_read(&hwc->count);
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3050 goto again;
3051
3052 delta = now - prev;
3053
3054 atomic64_add(delta, &counter->count);
3055 atomic64_sub(delta, &hwc->period_left);
3056}
3057
3058static void perf_swcounter_set_period(struct perf_counter *counter)
3059{
3060 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left);
3062 s64 period = hwc->sample_period;
3063
3064 if (unlikely(left <= -period)) {
3065 left = period;
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069
3070 if (unlikely(left <= 0)) {
3071 left += period;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075
3076 atomic64_set(&hwc->prev_count, -left);
3077 atomic64_set(&hwc->count, -left);
3078}
3079
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3081{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt
3094 * context, provide the next best thing, the user IP.
3095 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109}
3110
3111static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr)
3113{
3114 struct perf_sample_data data = {
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119
3120 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data))
3123 /* soft-disable the counter */
3124 ;
3125
3126}
3127
3128static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{
3130 struct perf_counter_context *ctx;
3131 unsigned long flags;
3132 int count;
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1;
3136
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0;
3139
3140 /*
3141 * If the counter is inactive, it could be just because
3142 * its task is scheduled out, or because it's in a group
3143 * which could not go on the PMU. We want to count in
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */
3154 ctx = counter->ctx;
3155 spin_lock_irqsave(&ctx->lock, flags);
3156 count = 1;
3157 /* Re-check state now we have the lock */
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3159 counter->ctx->is_active ||
3160 counter->tstamp_stopped < ctx->time)
3161 count = 0;
3162 spin_unlock_irqrestore(&ctx->lock, flags);
3163 return count;
3164}
3165
3166static int perf_swcounter_match(struct perf_counter *counter,
3167 enum perf_type_id type,
3168 u32 event, struct pt_regs *regs)
3169{
3170 if (!perf_swcounter_is_counting(counter))
3171 return 0;
3172
3173 if (counter->attr.type != type)
3174 return 0;
3175 if (counter->attr.config != event)
3176 return 0;
3177
3178 if (regs) {
3179 if (counter->attr.exclude_user && user_mode(regs))
3180 return 0;
3181
3182 if (counter->attr.exclude_kernel && !user_mode(regs))
3183 return 0;
3184 }
3185
3186 return 1;
3187}
3188
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
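/*
 * Note the sign convention that ties perf_swcounter_set_period() and
 * perf_swcounter_add() together: the count is primed to -left, so the period
 * has expired exactly when the running sum crosses zero, i.e. when
 * atomic64_add_negative() above returns false.
 */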
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event,
3200 u64 nr, int nmi, struct pt_regs *regs,
3201 u64 addr)
3202{
3203 struct perf_counter *counter;
3204
3205 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3206 return;
3207
3208 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr);
3212 }
3213 rcu_read_unlock();
3214}
3215
3216static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3217{
3218 if (in_nmi())
3219 return &cpuctx->recursion[3];
3220
3221 if (in_irq())
3222 return &cpuctx->recursion[2];
3223
3224 if (in_softirq())
3225 return &cpuctx->recursion[1];
3226
3227 return &cpuctx->recursion[0];
3228}
3229
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs,
3232 u64 addr)
3233{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx);
3236 struct perf_counter_context *ctx;
3237
3238 if (*recursion)
3239 goto out;
3240
3241 (*recursion)++;
3242 barrier();
3243
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr);
3246 rcu_read_lock();
3247 /*
3248 * it doesn't really matter which of the child contexts the
3249 * event ends up in.
3250 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
3254 rcu_read_unlock();
3255
3256 barrier();
3257 (*recursion)--;
3258
3259out:
3260 put_cpu_var(perf_cpu_context);
3261}
3262
3263void
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3265{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
3267}
3268
3269static void perf_swcounter_read(struct perf_counter *counter)
3270{
3271 perf_swcounter_update(counter);
3272}
3273
3274static int perf_swcounter_enable(struct perf_counter *counter)
3275{
3276 perf_swcounter_set_period(counter);
3277 return 0;
3278}
3279
3280static void perf_swcounter_disable(struct perf_counter *counter)
3281{
3282 perf_swcounter_update(counter);
3283}
3284
3285static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read,
3289};
3290
3291/*
3292 * Software counter: cpu wall time clock
3293 */
3294
3295static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3296{
3297 int cpu = raw_smp_processor_id();
3298 s64 prev;
3299 u64 now;
3300
3301 now = cpu_clock(cpu);
3302 prev = atomic64_read(&counter->hw.prev_count);
3303 atomic64_set(&counter->hw.prev_count, now);
3304 atomic64_add(now - prev, &counter->count);
3305}
3306
3307static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3308{
3309 struct hw_perf_counter *hwc = &counter->hw;
3310 int cpu = raw_smp_processor_id();
3311
3312 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3313 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3314 hwc->hrtimer.function = perf_swcounter_hrtimer;
3315 if (hwc->sample_period) {
3316 u64 period = max_t(u64, 10000, hwc->sample_period);
3317 __hrtimer_start_range_ns(&hwc->hrtimer,
3318 ns_to_ktime(period), 0,
3319 HRTIMER_MODE_REL, 0);
3320 }
3321
3322 return 0;
3323}
3324
3325static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3326{
3327 if (counter->hw.sample_period)
3328 hrtimer_cancel(&counter->hw.hrtimer);
3329 cpu_clock_perf_counter_update(counter);
3330}
3331
3332static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3333{
3334 cpu_clock_perf_counter_update(counter);
3335}
3336
3337static const struct pmu perf_ops_cpu_clock = {
3338 .enable = cpu_clock_perf_counter_enable,
3339 .disable = cpu_clock_perf_counter_disable,
3340 .read = cpu_clock_perf_counter_read,
3341};
3342
3343/*
3344 * Software counter: task time clock
3345 */
3346
3347static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3348{
3349 u64 prev;
3350 s64 delta;
3351
3352 prev = atomic64_xchg(&counter->hw.prev_count, now);
3353 delta = now - prev;
3354 atomic64_add(delta, &counter->count);
3355}
3356
3357static int task_clock_perf_counter_enable(struct perf_counter *counter)
3358{
3359 struct hw_perf_counter *hwc = &counter->hw;
3360 u64 now;
3361
3362 now = counter->ctx->time;
3363
3364 atomic64_set(&hwc->prev_count, now);
3365 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3366 hwc->hrtimer.function = perf_swcounter_hrtimer;
3367 if (hwc->sample_period) {
3368 u64 period = max_t(u64, 10000, hwc->sample_period);
3369 __hrtimer_start_range_ns(&hwc->hrtimer,
3370 ns_to_ktime(period), 0,
3371 HRTIMER_MODE_REL, 0);
3372 }
3373
3374 return 0;
3375}
3376
3377static void task_clock_perf_counter_disable(struct perf_counter *counter)
3378{
3379 if (counter->hw.sample_period)
3380 hrtimer_cancel(&counter->hw.hrtimer);
3381 task_clock_perf_counter_update(counter, counter->ctx->time);
3382
3383}
3384
3385static void task_clock_perf_counter_read(struct perf_counter *counter)
3386{
3387 u64 time;
3388
3389 if (!in_nmi()) {
3390 update_context_time(counter->ctx);
3391 time = counter->ctx->time;
3392 } else {
3393 u64 now = perf_clock();
3394 u64 delta = now - counter->ctx->timestamp;
3395 time = counter->ctx->time + delta;
3396 }
3397
3398 task_clock_perf_counter_update(counter, time);
3399}
3400
3401static const struct pmu perf_ops_task_clock = {
3402 .enable = task_clock_perf_counter_enable,
3403 .disable = task_clock_perf_counter_disable,
3404 .read = task_clock_perf_counter_read,
3405};
3406
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id)
3430{
3431 struct pt_regs *regs = get_irq_regs();
3432
3433 if (!regs)
3434 regs = task_pt_regs(current);
3435
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
3437}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439
3440extern int ftrace_profile_enable(int);
3441extern void ftrace_profile_disable(int);
3442
3443static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{
3445 ftrace_profile_disable(perf_event_id(&counter->attr));
3446}
3447
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{
3450 int event_id = perf_event_id(&counter->attr);
3451 int ret;
3452
3453 ret = ftrace_profile_enable(event_id);
3454 if (ret)
3455 return NULL;
3456
3457 counter->destroy = tp_perf_counter_destroy;
3458
3459 return &perf_ops_generic;
3460}
3461#else
3462static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3463{
3464 return NULL;
3465}
3466#endif
3467
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{
3470 const struct pmu *pmu = NULL;
3471
3472 /*
3473 * Software counters (currently) can't in general distinguish
3474 * between user, kernel and hypervisor events.
3475 * However, context switches and cpu migrations are considered
3476 * to be kernel events, and page faults are never hypervisor
3477 * events.
3478 */
3479 switch (counter->attr.config) {
3480 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock;
3482
3483 break;
3484 case PERF_COUNT_SW_TASK_CLOCK:
3485 /*
3486 * If the user instantiates this as a per-cpu counter,
3487 * use the cpu_clock counter instead.
3488 */
3489 if (counter->ctx->task)
3490 pmu = &perf_ops_task_clock;
3491 else
3492 pmu = &perf_ops_cpu_clock;
3493
3494 break;
3495 case PERF_COUNT_SW_PAGE_FAULTS:
3496 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS:
3500 pmu = &perf_ops_generic;
3501 break;
3502 }
3503
3504 return pmu;
3505}
3506
3507/*
3508 * Allocate and initialize a counter structure
3509 */
3510static struct perf_counter *
3511perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu,
3513 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader,
3515 gfp_t gfpflags)
3516{
3517 const struct pmu *pmu;
3518 struct perf_counter *counter;
3519 struct hw_perf_counter *hwc;
3520 long err;
3521
3522 counter = kzalloc(sizeof(*counter), gfpflags);
3523 if (!counter)
3524 return ERR_PTR(-ENOMEM);
3525
3526 /*
3527 * Single counters are their own group leaders, with an
3528 * empty sibling list:
3529 */
3530 if (!group_leader)
3531 group_leader = counter;
3532
3533 mutex_init(&counter->child_mutex);
3534 INIT_LIST_HEAD(&counter->child_list);
3535
3536 INIT_LIST_HEAD(&counter->list_entry);
3537 INIT_LIST_HEAD(&counter->event_entry);
3538 INIT_LIST_HEAD(&counter->sibling_list);
3539 init_waitqueue_head(&counter->waitq);
3540
3541 mutex_init(&counter->mmap_mutex);
3542
3543 counter->cpu = cpu;
3544 counter->attr = *attr;
3545 counter->group_leader = group_leader;
3546 counter->pmu = NULL;
3547 counter->ctx = ctx;
3548 counter->oncpu = -1;
3549
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id);
3552
3553 counter->state = PERF_COUNTER_STATE_INACTIVE;
3554
3555 if (attr->disabled)
3556 counter->state = PERF_COUNTER_STATE_OFF;
3557
3558 pmu = NULL;
3559
3560 hwc = &counter->hw;
3561 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1;
3564
3565 atomic64_set(&hwc->period_left, hwc->sample_period);
3566
3567 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3569 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3571 goto done;
3572
3573 switch (attr->type) {
3574 case PERF_TYPE_RAW:
3575 case PERF_TYPE_HARDWARE:
3576 case PERF_TYPE_HW_CACHE:
3577 pmu = hw_perf_counter_init(counter);
3578 break;
3579
3580 case PERF_TYPE_SOFTWARE:
3581 pmu = sw_perf_counter_init(counter);
3582 break;
3583
3584 case PERF_TYPE_TRACEPOINT:
3585 pmu = tp_perf_counter_init(counter);
3586 break;
3587
3588 default:
3589 break;
3590 }
3591done:
3592 err = 0;
3593 if (!pmu)
3594 err = -EINVAL;
3595 else if (IS_ERR(pmu))
3596 err = PTR_ERR(pmu);
3597
3598 if (err) {
3599 if (counter->ns)
3600 put_pid_ns(counter->ns);
3601 kfree(counter);
3602 return ERR_PTR(err);
3603 }
3604
3605 counter->pmu = pmu;
3606
3607 atomic_inc(&nr_counters);
3608 if (counter->attr.mmap)
3609 atomic_inc(&nr_mmap_counters);
3610 if (counter->attr.comm)
3611 atomic_inc(&nr_comm_counters);
3612
3613 return counter;
3614}
3615
3616static int perf_copy_attr(struct perf_counter_attr __user *uattr,
3617 struct perf_counter_attr *attr)
3618{
3619 int ret;
3620 u32 size;
3621
3622 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
3623 return -EFAULT;
3624
3625 /*
3626 * Zero the full structure, so that a short copy leaves all trailing fields zeroed.
3627 */
3628 memset(attr, 0, sizeof(*attr));
3629
3630 ret = get_user(size, &uattr->size);
3631 if (ret)
3632 return ret;
3633
3634 if (size > PAGE_SIZE) /* silly large */
3635 goto err_size;
3636
3637 if (!size) /* abi compat */
3638 size = PERF_ATTR_SIZE_VER0;
3639
3640 if (size < PERF_ATTR_SIZE_VER0)
3641 goto err_size;
3642
3643 /*
3644 * If we're handed a bigger struct than we know of,
3645 * ensure all the unknown bits are 0.
3646 */
3647 if (size > sizeof(*attr)) {
3648 unsigned long val;
3649 unsigned long __user *addr;
3650 unsigned long __user *end;
3651
3652 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
3653 sizeof(unsigned long));
3654 end = PTR_ALIGN((void __user *)uattr + size,
3655 sizeof(unsigned long));
3656
3657 for (; addr < end; addr += sizeof(unsigned long)) {
3658 ret = get_user(val, addr);
3659 if (ret)
3660 return ret;
3661 if (val)
3662 goto err_size;
3663 }
3664 }
3665
3666 ret = copy_from_user(attr, uattr, size);
3667 if (ret)
3668 return -EFAULT;
3669
3670 /*
3671 * If the type exists, the corresponding type-specific counter
3672 * initialization will verify attr->config.
3673 */
3674 if (attr->type >= PERF_TYPE_MAX)
3675 return -EINVAL;
3676
3677 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
3678 return -EINVAL;
3679
3680 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
3681 return -EINVAL;
3682
3683 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
3684 return -EINVAL;
3685
3686out:
3687 return ret;
3688
3689err_size:
3690 put_user(sizeof(*attr), &uattr->size);
3691 ret = -E2BIG;
3692 goto out;
3693}
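/*
 * Summary of the ABI handshake implemented above: userspace advertises the
 * size of the perf_counter_attr it was built against.  An older, smaller
 * struct is copied and the remaining fields stay zero; a newer, larger struct
 * is only accepted if every byte beyond what this kernel knows about is zero;
 * otherwise the call fails with -E2BIG and the kernel writes its own
 * sizeof(*attr) back into uattr->size so the caller can retry.
 */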
3694
3695/**
3696 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3697 *
3698 * @attr_uptr: event type attributes for monitoring/sampling
3699 * @pid: target pid
3700 * @cpu: target cpu
3701 * @group_fd: group leader counter fd
3702 */
3703SYSCALL_DEFINE5(perf_counter_open,
3704 struct perf_counter_attr __user *, attr_uptr,
3705 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3706{
3707 struct perf_counter *counter, *group_leader;
3708 struct perf_counter_attr attr;
3709 struct perf_counter_context *ctx;
3710 struct file *counter_file = NULL;
3711 struct file *group_file = NULL;
3712 int fput_needed = 0;
3713 int fput_needed2 = 0;
3714 int ret;
3715
3716 /* for future expandability... */
3717 if (flags)
3718 return -EINVAL;
3719
3720 ret = perf_copy_attr(attr_uptr, &attr);
3721 if (ret)
3722 return ret;
3723
3724 if (!attr.exclude_kernel) {
3725 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
3726 return -EACCES;
3727 }
3728
3729 if (attr.freq) {
3730 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
3731 return -EINVAL;
3732 }
3733
3734 /*
3735 * Get the target context (task or percpu):
3736 */
3737 ctx = find_get_context(pid, cpu);
3738 if (IS_ERR(ctx))
3739 return PTR_ERR(ctx);
3740
3741 /*
3742 * Look up the group leader (we will attach this counter to it):
3743 */
3744 group_leader = NULL;
3745 if (group_fd != -1) {
3746 ret = -EINVAL;
3747 group_file = fget_light(group_fd, &fput_needed);
3748 if (!group_file)
3749 goto err_put_context;
3750 if (group_file->f_op != &perf_fops)
3751 goto err_put_context;
3752
3753 group_leader = group_file->private_data;
3754 /*
3755 * Do not allow a recursive hierarchy (this new sibling
3756 * becoming part of another group-sibling):
3757 */
3758 if (group_leader->group_leader != group_leader)
3759 goto err_put_context;
3760 /*
3761 * Do not allow to attach to a group in a different
3762 * task or CPU context:
3763 */
3764 if (group_leader->ctx != ctx)
3765 goto err_put_context;
3766 /*
3767 * Only a group leader can be exclusive or pinned
3768 */
3769 if (attr.exclusive || attr.pinned)
3770 goto err_put_context;
3771 }
3772
3773 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3774 GFP_KERNEL);
3775 ret = PTR_ERR(counter);
3776 if (IS_ERR(counter))
3777 goto err_put_context;
3778
3779 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3780 if (ret < 0)
3781 goto err_free_put_context;
3782
3783 counter_file = fget_light(ret, &fput_needed2);
3784 if (!counter_file)
3785 goto err_free_put_context;
3786
3787 counter->filp = counter_file;
3788 WARN_ON_ONCE(ctx->parent_ctx);
3789 mutex_lock(&ctx->mutex);
3790 perf_install_in_context(ctx, counter, cpu);
3791 ++ctx->generation;
3792 mutex_unlock(&ctx->mutex);
3793
3794 counter->owner = current;
3795 get_task_struct(current);
3796 mutex_lock(&current->perf_counter_mutex);
3797 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3798 mutex_unlock(&current->perf_counter_mutex);
3799
3800 fput_light(counter_file, fput_needed2);
3801
3802out_fput:
3803 fput_light(group_file, fput_needed);
3804
3805 return ret;
3806
3807err_free_put_context:
3808 kfree(counter);
3809
3810err_put_context:
3811 put_ctx(ctx);
3812
3813 goto out_fput;
3814}
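/*
 * Minimal userspace sketch of calling the syscall defined above; purely
 * illustrative.  It assumes <linux/perf_counter.h> provides
 * struct perf_counter_attr, PERF_TYPE_HARDWARE and PERF_COUNT_HW_CPU_CYCLES,
 * and that the architecture's unistd.h defines __NR_perf_counter_open.
 */
#include <linux/perf_counter.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

static int open_cycles_counter(pid_t pid, int cpu)
{
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;

	/* group_fd == -1: this counter is its own group leader */
	return syscall(__NR_perf_counter_open, &attr, pid, cpu, -1, 0UL);
}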
3815
3816/*
3817 * inherit a counter from parent task to child task:
3818 */
3819static struct perf_counter *
3820inherit_counter(struct perf_counter *parent_counter,
3821 struct task_struct *parent,
3822 struct perf_counter_context *parent_ctx,
3823 struct task_struct *child,
3824 struct perf_counter *group_leader,
3825 struct perf_counter_context *child_ctx)
3826{
3827 struct perf_counter *child_counter;
3828
3829 /*
3830 * Instead of creating recursive hierarchies of counters,
3831 * we link inherited counters back to the original parent,
3832 * which is guaranteed to have a filp that we use as the
3833 * reference count:
3834 */
3835 if (parent_counter->parent)
3836 parent_counter = parent_counter->parent;
3837
3838 child_counter = perf_counter_alloc(&parent_counter->attr,
3839 parent_counter->cpu, child_ctx,
3840 group_leader, GFP_KERNEL);
3841 if (IS_ERR(child_counter))
3842 return child_counter;
3843 get_ctx(child_ctx);
3844
3845 /*
3846 * Make the child state follow the state of the parent counter,
3847 * not its attr.disabled bit. We hold the parent's mutex,
3848 * so we won't race with perf_counter_{en, dis}able_family.
3849 */
3850 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3851 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3852 else
3853 child_counter->state = PERF_COUNTER_STATE_OFF;
3854
3855 if (parent_counter->attr.freq)
3856 child_counter->hw.sample_period = parent_counter->hw.sample_period;
3857
3858 /*
3859 * Link it up in the child's context:
3860 */
3861 add_counter_to_ctx(child_counter, child_ctx);
3862
3863 child_counter->parent = parent_counter;
3864 /*
3865 * inherit into child's child as well:
3866 */
3867 child_counter->attr.inherit = 1;
3868
3869 /*
3870 * Get a reference to the parent filp - we will fput it
3871 * when the child counter exits. This is safe to do because
3872 * we are in the parent and we know that the filp still
3873 * exists and has a nonzero count:
3874 */
3875 atomic_long_inc(&parent_counter->filp->f_count);
3876
3877 /*
3878 * Link this into the parent counter's child list
3879 */
3880 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3881 mutex_lock(&parent_counter->child_mutex);
3882 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3883 mutex_unlock(&parent_counter->child_mutex);
3884
3885 return child_counter;
3886}
3887
3888static int inherit_group(struct perf_counter *parent_counter,
3889 struct task_struct *parent,
3890 struct perf_counter_context *parent_ctx,
3891 struct task_struct *child,
3892 struct perf_counter_context *child_ctx)
3893{
3894 struct perf_counter *leader;
3895 struct perf_counter *sub;
3896 struct perf_counter *child_ctr;
3897
3898 leader = inherit_counter(parent_counter, parent, parent_ctx,
3899 child, NULL, child_ctx);
3900 if (IS_ERR(leader))
3901 return PTR_ERR(leader);
3902 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3903 child_ctr = inherit_counter(sub, parent, parent_ctx,
3904 child, leader, child_ctx);
3905 if (IS_ERR(child_ctr))
3906 return PTR_ERR(child_ctr);
3907 }
3908 return 0;
3909}
3910
3911static void sync_child_counter(struct perf_counter *child_counter,
3912 struct perf_counter *parent_counter)
3913{
3914 u64 child_val;
3915
3916 child_val = atomic64_read(&child_counter->count);
3917
3918 /*
3919 * Add back the child's count to the parent's count:
3920 */
3921 atomic64_add(child_val, &parent_counter->count);
3922 atomic64_add(child_counter->total_time_enabled,
3923 &parent_counter->child_total_time_enabled);
3924 atomic64_add(child_counter->total_time_running,
3925 &parent_counter->child_total_time_running);
3926
3927 /*
3928 * Remove this counter from the parent's list
3929 */
3930 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3931 mutex_lock(&parent_counter->child_mutex);
3932 list_del_init(&child_counter->child_list);
3933 mutex_unlock(&parent_counter->child_mutex);
3934
3935 /*
3936 * Release the parent counter, if this was the last
3937 * reference to it.
3938 */
3939 fput(parent_counter->filp);
3940}
3941
3942static void
3943__perf_counter_exit_task(struct perf_counter *child_counter,
3944 struct perf_counter_context *child_ctx)
3945{
3946 struct perf_counter *parent_counter;
3947
3948 update_counter_times(child_counter);
3949 perf_counter_remove_from_context(child_counter);
3950
3951 parent_counter = child_counter->parent;
3952 /*
3953 * It can happen that parent exits first, and has counters
3954 * that are still around due to the child reference. These
3955 * counters need to be zapped - but otherwise linger.
3956 */
3957 if (parent_counter) {
3958 sync_child_counter(child_counter, parent_counter);
3959 free_counter(child_counter);
3960 }
3961}
3962
3963/*
3964 * When a child task exits, feed back counter values to parent counters.
3965 */
3966void perf_counter_exit_task(struct task_struct *child)
3967{
3968 struct perf_counter *child_counter, *tmp;
3969 struct perf_counter_context *child_ctx;
3970 unsigned long flags;
3971
3972 if (likely(!child->perf_counter_ctxp))
3973 return;
3974
3975 local_irq_save(flags);
3976 /*
3977 * We can't reschedule here because interrupts are disabled,
3978 * and either the child is current or it is a task that can't be
3979 * scheduled, so we are now safe from a reschedule changing
3980 * our context.
3981 */
3982 child_ctx = child->perf_counter_ctxp;
3983 __perf_counter_task_sched_out(child_ctx);
3984
3985 /*
3986 * Take the context lock here so that if find_get_context is
3987 * reading child->perf_counter_ctxp, we wait until it has
3988 * incremented the context's refcount before we do put_ctx below.
3989 */
3990 spin_lock(&child_ctx->lock);
3991 child->perf_counter_ctxp = NULL;
3992 if (child_ctx->parent_ctx) {
3993 /*
3994 * This context is a clone; unclone it so it can't get
3995 * swapped to another process while we're removing all
3996 * the counters from it.
3997 */
3998 put_ctx(child_ctx->parent_ctx);
3999 child_ctx->parent_ctx = NULL;
4000 }
4001 spin_unlock(&child_ctx->lock);
4002 local_irq_restore(flags);
4003
4004 /*
4005 * We can recurse on the same lock type through:
4006 *
4007 * __perf_counter_exit_task()
4008 * sync_child_counter()
4009 * fput(parent_counter->filp)
4010 * perf_release()
4011 * mutex_lock(&ctx->mutex)
4012 *
4013 * But since it's the parent context it won't be the same instance.
4014 */
4015 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4016
4017again:
4018 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4019 list_entry)
4020 __perf_counter_exit_task(child_counter, child_ctx);
4021
4022 /*
4023 * If the last counter was a group counter, it will have appended all
4024 * its siblings to the list, but we obtained 'tmp' before that which
4025 * will still point to the list head terminating the iteration.
4026 */
4027 if (!list_empty(&child_ctx->counter_list))
4028 goto again;
4029
4030 mutex_unlock(&child_ctx->mutex);
4031
4032 put_ctx(child_ctx);
4033}
4034
4035/*
4036 * Free an unexposed, unused context as created by inheritance by
4037 * perf_counter_init_task() below; used by fork() in case of failure.
4038 */
4039void perf_counter_free_task(struct task_struct *task)
4040{
4041 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4042 struct perf_counter *counter, *tmp;
4043
4044 if (!ctx)
4045 return;
4046
4047 mutex_lock(&ctx->mutex);
4048again:
4049 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4050 struct perf_counter *parent = counter->parent;
4051
4052 if (WARN_ON_ONCE(!parent))
4053 continue;
4054
4055 mutex_lock(&parent->child_mutex);
4056 list_del_init(&counter->child_list);
4057 mutex_unlock(&parent->child_mutex);
4058
4059 fput(parent->filp);
4060
4061 list_del_counter(counter, ctx);
4062 free_counter(counter);
4063 }
4064
4065 if (!list_empty(&ctx->counter_list))
4066 goto again;
4067
4068 mutex_unlock(&ctx->mutex);
4069
4070 put_ctx(ctx);
4071}
4072
4073/*
4074 * Initialize the perf_counter context in task_struct
4075 */
4076int perf_counter_init_task(struct task_struct *child)
4077{
4078 struct perf_counter_context *child_ctx, *parent_ctx;
4079 struct perf_counter_context *cloned_ctx;
4080 struct perf_counter *counter;
4081 struct task_struct *parent = current;
4082 int inherited_all = 1;
4083 int ret = 0;
4084
4085 child->perf_counter_ctxp = NULL;
4086
4087 mutex_init(&child->perf_counter_mutex);
4088 INIT_LIST_HEAD(&child->perf_counter_list);
4089
4090 if (likely(!parent->perf_counter_ctxp))
4091 return 0;
4092
4093 /*
4094 * This is executed from the parent task context, so inherit
4095 * counters that have been marked for cloning.
4096 * First allocate and initialize a context for the child.
4097 */
4098
4099 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4100 if (!child_ctx)
4101 return -ENOMEM;
4102
4103 __perf_counter_init_context(child_ctx, child);
4104 child->perf_counter_ctxp = child_ctx;
4105 get_task_struct(child);
4106
4107 /*
4108 * If the parent's context is a clone, pin it so it won't get
4109 * swapped under us.
4110 */
4111 parent_ctx = perf_pin_task_context(parent);
4112
4113 /*
4114 * No need to check if parent_ctx != NULL here; since we saw
4115 * it non-NULL earlier, the only reason for it to become NULL
4116 * is if we exit, and since we're currently in the middle of
4117 * a fork we can't be exiting at the same time.
4118 */
4119
4120 /*
4121 * Lock the parent list. No need to lock the child - not PID
4122 * hashed yet and not running, so nobody can access it.
4123 */
4124 mutex_lock(&parent_ctx->mutex);
4125
4126 /*
4127 * We don't have to disable NMIs - we are only looking at
4128 * the list, not manipulating it:
4129 */
4130 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4131 if (counter != counter->group_leader)
4132 continue;
4133
4134 if (!counter->attr.inherit) {
4135 inherited_all = 0;
4136 continue;
4137 }
4138
4139 ret = inherit_group(counter, parent, parent_ctx,
4140 child, child_ctx);
4141 if (ret) {
4142 inherited_all = 0;
4143 break;
4144 }
4145 }
4146
4147 if (inherited_all) {
4148 /*
4149 * Mark the child context as a clone of the parent
4150 * context, or of whatever the parent is a clone of.
4151 * Note that if the parent is a clone, it could get
4152 * uncloned at any point, but that doesn't matter
4153 * because the list of counters and the generation
4154 * count can't have changed since we took the mutex.
4155 */
4156 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4157 if (cloned_ctx) {
4158 child_ctx->parent_ctx = cloned_ctx;
4159 child_ctx->parent_gen = parent_ctx->parent_gen;
4160 } else {
4161 child_ctx->parent_ctx = parent_ctx;
4162 child_ctx->parent_gen = parent_ctx->generation;
4163 }
4164 get_ctx(child_ctx->parent_ctx);
4165 }
4166
4167 mutex_unlock(&parent_ctx->mutex);
4168
4169 perf_unpin_context(parent_ctx);
4170
4171 return ret;
4172}
4173
4174static void __cpuinit perf_counter_init_cpu(int cpu)
4175{
4176 struct perf_cpu_context *cpuctx;
4177
4178 cpuctx = &per_cpu(perf_cpu_context, cpu);
4179 __perf_counter_init_context(&cpuctx->ctx, NULL);
4180
4181 spin_lock(&perf_resource_lock);
4182 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4183 spin_unlock(&perf_resource_lock);
4184
4185 hw_perf_counter_setup(cpu);
4186}
4187
4188#ifdef CONFIG_HOTPLUG_CPU
4189static void __perf_counter_exit_cpu(void *info)
4190{
4191 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4192 struct perf_counter_context *ctx = &cpuctx->ctx;
4193 struct perf_counter *counter, *tmp;
4194
4195 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4196 __perf_counter_remove_from_context(counter);
4197}
4198static void perf_counter_exit_cpu(int cpu)
4199{
4200 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4201 struct perf_counter_context *ctx = &cpuctx->ctx;
4202
4203 mutex_lock(&ctx->mutex);
4204 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4205 mutex_unlock(&ctx->mutex);
4206}
4207#else
4208static inline void perf_counter_exit_cpu(int cpu) { }
4209#endif
4210
4211static int __cpuinit
4212perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4213{
4214 unsigned int cpu = (long)hcpu;
4215
4216 switch (action) {
4217
4218 case CPU_UP_PREPARE:
4219 case CPU_UP_PREPARE_FROZEN:
4220 perf_counter_init_cpu(cpu);
4221 break;
4222
4223 case CPU_DOWN_PREPARE:
4224 case CPU_DOWN_PREPARE_FROZEN:
4225 perf_counter_exit_cpu(cpu);
4226 break;
4227
4228 default:
4229 break;
4230 }
4231
4232 return NOTIFY_OK;
4233}
4234
4235/*
4236 * This has to have a higher priority than migration_notifier in sched.c.
4237 */
4238static struct notifier_block __cpuinitdata perf_cpu_nb = {
4239 .notifier_call = perf_cpu_notify,
4240 .priority = 20,
4241};
4242
4243void __init perf_counter_init(void)
4244{
4245 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4246 (void *)(long)smp_processor_id());
4247 register_cpu_notifier(&perf_cpu_nb);
4248}
4249
4250static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4251{
4252 return sprintf(buf, "%d\n", perf_reserved_percpu);
4253}
4254
4255static ssize_t
4256perf_set_reserve_percpu(struct sysdev_class *class,
4257 const char *buf,
4258 size_t count)
4259{
4260 struct perf_cpu_context *cpuctx;
4261 unsigned long val;
4262 int err, cpu, mpt;
4263
4264 err = strict_strtoul(buf, 10, &val);
4265 if (err)
4266 return err;
4267 if (val > perf_max_counters)
4268 return -EINVAL;
4269
4270 spin_lock(&perf_resource_lock);
4271 perf_reserved_percpu = val;
4272 for_each_online_cpu(cpu) {
4273 cpuctx = &per_cpu(perf_cpu_context, cpu);
4274 spin_lock_irq(&cpuctx->ctx.lock);
4275 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4276 perf_max_counters - perf_reserved_percpu);
4277 cpuctx->max_pertask = mpt;
4278 spin_unlock_irq(&cpuctx->ctx.lock);
4279 }
4280 spin_unlock(&perf_resource_lock);
4281
4282 return count;
4283}
4284
4285static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4286{
4287 return sprintf(buf, "%d\n", perf_overcommit);
4288}
4289
4290static ssize_t
4291perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4292{
4293 unsigned long val;
4294 int err;
4295
4296 err = strict_strtoul(buf, 10, &val);
4297 if (err)
4298 return err;
4299 if (val > 1)
4300 return -EINVAL;
4301
4302 spin_lock(&perf_resource_lock);
4303 perf_overcommit = val;
4304 spin_unlock(&perf_resource_lock);
4305
4306 return count;
4307}
4308
4309static SYSDEV_CLASS_ATTR(
4310 reserve_percpu,
4311 0644,
4312 perf_show_reserve_percpu,
4313 perf_set_reserve_percpu
4314 );
4315
4316static SYSDEV_CLASS_ATTR(
4317 overcommit,
4318 0644,
4319 perf_show_overcommit,
4320 perf_set_overcommit
4321 );
4322
4323static struct attribute *perfclass_attrs[] = {
4324 &attr_reserve_percpu.attr,
4325 &attr_overcommit.attr,
4326 NULL
4327};
4328
4329static struct attribute_group perfclass_attr_group = {
4330 .attrs = perfclass_attrs,
4331 .name = "perf_counters",
4332};
4333
4334static int __init perf_counter_sysfs_init(void)
4335{
4336 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4337 &perfclass_attr_group);
4338}
4339device_initcall(perf_counter_sysfs_init);
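/*
 * The class attributes registered above should appear as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit (both 0644), parsed with
 * strict_strtoul() as shown in the store handlers.
 */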
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96b..72067cbdb37f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
 
 	  Turning OFF this setting is NOT recommended! If in doubt, say Y.
 
+config HIBERNATION_NVS
+	bool
+
 config HIBERNATION
 	bool "Hibernation (aka 'suspend to disk')"
 	depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+	select HIBERNATION_NVS if HAS_IOMEM
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781bd..c3b81c30e5d5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,9 @@ endif
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
-obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
+obj-$(CONFIG_SUSPEND)		+= suspend.o
+obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
+obj-$(CONFIG_HIBERNATION)	+= swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION_NVS)	+= hibernate_nvs.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index e71ca9cd81b2..81d2e7464893 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * kernel/power/disk.c - Suspend-to-disk support. 2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
7 * 8 *
8 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
9 *
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -215,19 +215,17 @@ static int create_image(int platform_mode)
215 if (error) 215 if (error)
216 return error; 216 return error;
217 217
218 device_pm_lock(); 218 /* At this point, dpm_suspend_start() has been called, but *not*
219 219 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
220 /* At this point, device_suspend() has been called, but *not*
221 * device_power_down(). We *must* call device_power_down() now.
222 * Otherwise, drivers for some devices (e.g. interrupt controllers) 220 * Otherwise, drivers for some devices (e.g. interrupt controllers)
223 * become desynchronized with the actual state of the hardware 221 * become desynchronized with the actual state of the hardware
224 * at resume time, and evil weirdness ensues. 222 * at resume time, and evil weirdness ensues.
225 */ 223 */
226 error = device_power_down(PMSG_FREEZE); 224 error = dpm_suspend_noirq(PMSG_FREEZE);
227 if (error) { 225 if (error) {
228 printk(KERN_ERR "PM: Some devices failed to power down, " 226 printk(KERN_ERR "PM: Some devices failed to power down, "
229 "aborting hibernation\n"); 227 "aborting hibernation\n");
230 goto Unlock; 228 return error;
231 } 229 }
232 230
233 error = platform_pre_snapshot(platform_mode); 231 error = platform_pre_snapshot(platform_mode);
@@ -241,9 +239,9 @@ static int create_image(int platform_mode)
241 239
242 local_irq_disable(); 240 local_irq_disable();
243 241
244 sysdev_suspend(PMSG_FREEZE); 242 error = sysdev_suspend(PMSG_FREEZE);
245 if (error) { 243 if (error) {
246 printk(KERN_ERR "PM: Some devices failed to power down, " 244 printk(KERN_ERR "PM: Some system devices failed to power down, "
247 "aborting hibernation\n"); 245 "aborting hibernation\n");
248 goto Enable_irqs; 246 goto Enable_irqs;
249 } 247 }
@@ -264,7 +262,7 @@ static int create_image(int platform_mode)
264 262
265 Power_up: 263 Power_up:
266 sysdev_resume(); 264 sysdev_resume();
267 /* NOTE: device_power_up() is just a resume() for devices 265 /* NOTE: dpm_resume_noirq() is just a resume() for devices
268 * that suspended with irqs off ... no overall powerup. 266 * that suspended with irqs off ... no overall powerup.
269 */ 267 */
270 268
@@ -277,12 +275,9 @@ static int create_image(int platform_mode)
277 Platform_finish: 275 Platform_finish:
278 platform_finish(platform_mode); 276 platform_finish(platform_mode);
279 277
280 device_power_up(in_suspend ? 278 dpm_resume_noirq(in_suspend ?
281 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
282 280
283 Unlock:
284 device_pm_unlock();
285
286 return error; 281 return error;
287} 282}
288 283
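The rewritten comment in this create_image() hunk spells out an ordering constraint: dpm_suspend_start() has already run, dpm_suspend_noirq() must complete before interrupts go off and the snapshot is taken, and every failure unwinds exactly the phases that already succeeded. A minimal user-space sketch of that ladder, with made-up phase names standing in for the kernel calls, is shown below; it is a model of the control flow only, not kernel code.

/* Sketch: ordered suspend phases with goto-based rollback,
 * modelled on create_image(); phase names are illustrative. */
#include <stdio.h>

static int phase(const char *name, int fail)
{
        printf("enter %s\n", name);
        return fail ? -1 : 0;
}

static void undo(const char *name)
{
        printf("undo  %s\n", name);
}

static int create_image_model(int fail_at)
{
        int error;

        error = phase("suspend_noirq", fail_at == 1);
        if (error)
                return error;            /* nothing suspended yet, just bail */

        error = phase("platform_pre_snapshot", fail_at == 2);
        if (error)
                goto Resume_noirq;

        error = phase("sysdev_suspend", fail_at == 3);
        if (error)
                goto Platform_finish;

        phase("take_snapshot", 0);
        undo("sysdev_suspend");          /* sysdev_resume() */

 Platform_finish:
        undo("platform_pre_snapshot");   /* platform_finish() */
 Resume_noirq:
        undo("suspend_noirq");           /* dpm_resume_noirq() */
        return error;
}

int main(void)
{
        create_image_model(0);           /* full round trip */
        create_image_model(2);           /* platform step fails mid-way */
        return 0;
}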
@@ -309,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
309 goto Close; 304 goto Close;
310 305
311 suspend_console(); 306 suspend_console();
312 error = device_suspend(PMSG_FREEZE); 307 error = dpm_suspend_start(PMSG_FREEZE);
313 if (error) 308 if (error)
314 goto Recover_platform; 309 goto Recover_platform;
315 310
@@ -320,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
320 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
321 316
322 Resume_devices: 317 Resume_devices:
323 device_resume(in_suspend ? 318 dpm_resume_end(in_suspend ?
324 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
325 resume_console(); 320 resume_console();
326 Close: 321 Close:
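Both resume paths above pick the PM message with the same nested conditional: while the snapshot is still on its way out (in_suspend), failure means PMSG_RECOVER and success means PMSG_THAW; once control has come back after a restored image it is always PMSG_RESTORE. A tiny stand-alone rendering of that choice (plain enum values standing in for the kernel's pm_message_t):

/* Sketch of the message selection used in the hibernate resume paths;
 * the enum is a stand-in, not the kernel's pm_message_t. */
#include <stdio.h>

enum pmsg { PMSG_THAW, PMSG_RECOVER, PMSG_RESTORE };

static enum pmsg resume_msg(int in_suspend, int error)
{
        if (!in_suspend)
                return PMSG_RESTORE;   /* woke up from a restored image */
        return error ? PMSG_RECOVER    /* snapshot failed, recover devices */
                     : PMSG_THAW;      /* snapshot taken, thaw and write it out */
}

int main(void)
{
        printf("%d %d %d\n",
               resume_msg(1, 0),       /* PMSG_THAW    */
               resume_msg(1, -1),      /* PMSG_RECOVER */
               resume_msg(0, 0));      /* PMSG_RESTORE */
        return 0;
}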
@@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode)
344{ 339{
345 int error; 340 int error;
346 341
347 device_pm_lock(); 342 error = dpm_suspend_noirq(PMSG_QUIESCE);
348
349 error = device_power_down(PMSG_QUIESCE);
350 if (error) { 343 if (error) {
351 printk(KERN_ERR "PM: Some devices failed to power down, " 344 printk(KERN_ERR "PM: Some devices failed to power down, "
352 "aborting resume\n"); 345 "aborting resume\n");
353 goto Unlock; 346 return error;
354 } 347 }
355 348
356 error = platform_pre_restore(platform_mode); 349 error = platform_pre_restore(platform_mode);
@@ -401,10 +394,7 @@ static int resume_target_kernel(bool platform_mode)
401 Cleanup: 394 Cleanup:
402 platform_restore_cleanup(platform_mode); 395 platform_restore_cleanup(platform_mode);
403 396
404 device_power_up(PMSG_RECOVER); 397 dpm_resume_noirq(PMSG_RECOVER);
405
406 Unlock:
407 device_pm_unlock();
408 398
409 return error; 399 return error;
410} 400}
@@ -424,10 +414,10 @@ int hibernation_restore(int platform_mode)
424 414
425 pm_prepare_console(); 415 pm_prepare_console();
426 suspend_console(); 416 suspend_console();
427 error = device_suspend(PMSG_QUIESCE); 417 error = dpm_suspend_start(PMSG_QUIESCE);
428 if (!error) { 418 if (!error) {
429 error = resume_target_kernel(platform_mode); 419 error = resume_target_kernel(platform_mode);
430 device_resume(PMSG_RECOVER); 420 dpm_resume_end(PMSG_RECOVER);
431 } 421 }
432 resume_console(); 422 resume_console();
433 pm_restore_console(); 423 pm_restore_console();
@@ -457,18 +447,16 @@ int hibernation_platform_enter(void)
457 447
458 entering_platform_hibernation = true; 448 entering_platform_hibernation = true;
459 suspend_console(); 449 suspend_console();
460 error = device_suspend(PMSG_HIBERNATE); 450 error = dpm_suspend_start(PMSG_HIBERNATE);
461 if (error) { 451 if (error) {
462 if (hibernation_ops->recover) 452 if (hibernation_ops->recover)
463 hibernation_ops->recover(); 453 hibernation_ops->recover();
464 goto Resume_devices; 454 goto Resume_devices;
465 } 455 }
466 456
467 device_pm_lock(); 457 error = dpm_suspend_noirq(PMSG_HIBERNATE);
468
469 error = device_power_down(PMSG_HIBERNATE);
470 if (error) 458 if (error)
471 goto Unlock; 459 goto Resume_devices;
472 460
473 error = hibernation_ops->prepare(); 461 error = hibernation_ops->prepare();
474 if (error) 462 if (error)
@@ -491,14 +479,11 @@ int hibernation_platform_enter(void)
491 Platofrm_finish: 479 Platofrm_finish:
492 hibernation_ops->finish(); 480 hibernation_ops->finish();
493 481
494 device_power_up(PMSG_RESTORE); 482 dpm_suspend_noirq(PMSG_RESTORE);
495
496 Unlock:
497 device_pm_unlock();
498 483
499 Resume_devices: 484 Resume_devices:
500 entering_platform_hibernation = false; 485 entering_platform_hibernation = false;
501 device_resume(PMSG_RESTORE); 486 dpm_resume_end(PMSG_RESTORE);
502 resume_console(); 487 resume_console();
503 488
504 Close: 489 Close:
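hibernation_platform_enter() drives the platform hook table and, as the hunk above shows, only invokes the recover() hook when dpm_suspend_start() fails; on deeper failures it simply resumes what was suspended. A compact user-space sketch of that calling convention follows; the struct and function names are illustrative, not the kernel's platform_hibernation_ops.

/* Sketch: ops table with a recover() hook used only on device-suspend
 * failure, loosely modelled on hibernation_platform_enter(). */
#include <stdio.h>

struct hib_ops {
        int  (*prepare)(void);
        void (*finish)(void);
        void (*recover)(void);          /* may be NULL */
};

static int  prepare_ok(void)   { printf("prepare\n"); return 0; }
static void finish_hook(void)  { printf("finish\n");  }
static void recover_hook(void) { printf("recover\n"); }

static int platform_enter(const struct hib_ops *ops, int devices_fail)
{
        int error;

        if (devices_fail) {             /* dpm_suspend_start() failed */
                if (ops->recover)
                        ops->recover();
                return -1;
        }
        error = ops->prepare();
        if (error)
                return error;
        printf("enter platform sleep\n");
        ops->finish();
        return 0;
}

int main(void)
{
        struct hib_ops ops = { prepare_ok, finish_hook, recover_hook };

        platform_enter(&ops, 0);        /* normal path */
        platform_enter(&ops, 1);        /* device suspend failed: recover() */
        return 0;
}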
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 000000000000..39ac698ef836
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/suspend.h>
14
15/*
16 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that.
19 */
20
21struct nvs_page {
22 unsigned long phys_start;
23 unsigned int size;
24 void *kaddr;
25 void *data;
26 struct list_head node;
27};
28
29static LIST_HEAD(nvs_list);
30
31/**
32 * hibernate_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region
34 * @size - size of the region
35 *
36 * The NVS region need not be page-aligned (both ends) and we arrange
37 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages.
39 */
40int hibernate_nvs_register(unsigned long start, unsigned long size)
41{
42 struct nvs_page *entry, *next;
43
44 while (size > 0) {
45 unsigned int nr_bytes;
46
47 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
48 if (!entry)
49 goto Error;
50
51 list_add_tail(&entry->node, &nvs_list);
52 entry->phys_start = start;
53 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
54 entry->size = (size < nr_bytes) ? size : nr_bytes;
55
56 start += entry->size;
57 size -= entry->size;
58 }
59 return 0;
60
61 Error:
62 list_for_each_entry_safe(entry, next, &nvs_list, node) {
63 list_del(&entry->node);
64 kfree(entry);
65 }
66 return -ENOMEM;
67}
68
69/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions
71 */
72void hibernate_nvs_free(void)
73{
74 struct nvs_page *entry;
75
76 list_for_each_entry(entry, &nvs_list, node)
77 if (entry->data) {
78 free_page((unsigned long)entry->data);
79 entry->data = NULL;
80 if (entry->kaddr) {
81 iounmap(entry->kaddr);
82 entry->kaddr = NULL;
83 }
84 }
85}
86
87/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
89 */
90int hibernate_nvs_alloc(void)
91{
92 struct nvs_page *entry;
93
94 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) {
97 hibernate_nvs_free();
98 return -ENOMEM;
99 }
100 }
101 return 0;
102}
103
104/**
105 * hibernate_nvs_save - save NVS memory regions
106 */
107void hibernate_nvs_save(void)
108{
109 struct nvs_page *entry;
110
111 printk(KERN_INFO "PM: Saving platform NVS memory\n");
112
113 list_for_each_entry(entry, &nvs_list, node)
114 if (entry->data) {
115 entry->kaddr = ioremap(entry->phys_start, entry->size);
116 memcpy(entry->data, entry->kaddr, entry->size);
117 }
118}
119
120/**
121 * hibernate_nvs_restore - restore NVS memory regions
122 *
123 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region.
125 */
126void hibernate_nvs_restore(void)
127{
128 struct nvs_page *entry;
129
130 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
131
132 list_for_each_entry(entry, &nvs_list, node)
133 if (entry->data)
134 memcpy(entry->kaddr, entry->data, entry->size);
135}
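The registration loop in the new hibernate_nvs.c splits an arbitrarily aligned NVS region into chunks that never cross a page boundary: the first chunk runs from start to the end of its page, PAGE_SIZE - (start & ~PAGE_MASK) bytes, and every later chunk is a full page or whatever is left. A self-contained model of that arithmetic (PAGE_SIZE and PAGE_MASK are defined locally for the demo):

/* Sketch of the page-bounded chunking done by hibernate_nvs_register(). */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static void split_region(unsigned long start, unsigned long size)
{
        while (size > 0) {
                unsigned long nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
                unsigned long chunk = size < nr_bytes ? size : nr_bytes;

                printf("chunk at 0x%lx, %lu bytes\n", start, chunk);
                start += chunk;
                size  -= chunk;
        }
}

int main(void)
{
        /* A 10000-byte region starting 100 bytes into a page splits into
         * 3996 + 4096 + 1908 bytes, none of them crossing a page boundary. */
        split_region(0x1000 + 100, 10000);
        return 0;
}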
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f99ed6a75eac..f710e36930cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
12#include <linux/suspend.h>
13#include <linux/kobject.h> 11#include <linux/kobject.h>
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/delay.h>
16#include <linux/errno.h>
17#include <linux/kmod.h>
18#include <linux/init.h>
19#include <linux/console.h>
20#include <linux/cpu.h>
21#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
22#include <linux/freezer.h>
23#include <linux/vmstat.h>
24#include <linux/syscalls.h>
25 14
26#include "power.h" 15#include "power.h"
27 16
@@ -119,378 +108,6 @@ power_attr(pm_test);
119 108
120#endif /* CONFIG_PM_SLEEP */ 109#endif /* CONFIG_PM_SLEEP */
121 110
122#ifdef CONFIG_SUSPEND
123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND
137
138/*
139 * We test the system suspend code by setting an RTC wakealarm a short
140 * time in the future, then suspending. Suspending the devices won't
141 * normally take long ... some systems only need a few milliseconds.
142 *
143 * The time it takes is system-specific though, so when we test this
144 * during system bootup we allow a LOT of time.
145 */
146#define TEST_SUSPEND_SECONDS 5
147
148static unsigned long suspend_test_start_time;
149
150static void suspend_test_start(void)
151{
152 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
153 * What we want is a hardware counter that will work correctly even
154 * during the irqs-are-off stages of the suspend/resume cycle...
155 */
156 suspend_test_start_time = jiffies;
157}
158
159static void suspend_test_finish(const char *label)
160{
161 long nj = jiffies - suspend_test_start_time;
162 unsigned msec;
163
164 msec = jiffies_to_msecs(abs(nj));
165 pr_info("PM: %s took %d.%03d seconds\n", label,
166 msec / 1000, msec % 1000);
167
168 /* Warning on suspend means the RTC alarm period needs to be
169 * larger -- the system was sooo slooowwww to suspend that the
170 * alarm (should have) fired before the system went to sleep!
171 *
172 * Warning on either suspend or resume also means the system
173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk...
175 */
176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177}
178
179#else
180
181static void suspend_test_start(void)
182{
183}
184
185static void suspend_test_finish(const char *label)
186{
187}
188
189#endif
190
191/* This is just an arbitrary number */
192#define FREE_PAGE_NUMBER (100)
193
194static struct platform_suspend_ops *suspend_ops;
195
196/**
197 * suspend_set_ops - Set the global suspend method table.
198 * @ops: Pointer to ops structure.
199 */
200
201void suspend_set_ops(struct platform_suspend_ops *ops)
202{
203 mutex_lock(&pm_mutex);
204 suspend_ops = ops;
205 mutex_unlock(&pm_mutex);
206}
207
208/**
209 * suspend_valid_only_mem - generic memory-only valid callback
210 *
211 * Platform drivers that implement mem suspend only and only need
212 * to check for that in their .valid callback can use this instead
213 * of rolling their own .valid callback.
214 */
215int suspend_valid_only_mem(suspend_state_t state)
216{
217 return state == PM_SUSPEND_MEM;
218}
219
220/**
221 * suspend_prepare - Do prep work before entering low-power state.
222 *
223 * This is common code that is called for each state that we're entering.
224 * Run suspend notifiers, allocate a console and stop all processes.
225 */
226static int suspend_prepare(void)
227{
228 int error;
229 unsigned int free_pages;
230
231 if (!suspend_ops || !suspend_ops->enter)
232 return -EPERM;
233
234 pm_prepare_console();
235
236 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
237 if (error)
238 goto Finish;
239
240 error = usermodehelper_disable();
241 if (error)
242 goto Finish;
243
244 if (suspend_freeze_processes()) {
245 error = -EAGAIN;
246 goto Thaw;
247 }
248
249 free_pages = global_page_state(NR_FREE_PAGES);
250 if (free_pages < FREE_PAGE_NUMBER) {
251 pr_debug("PM: free some memory\n");
252 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
253 if (nr_free_pages() < FREE_PAGE_NUMBER) {
254 error = -ENOMEM;
255 printk(KERN_ERR "PM: No enough memory\n");
256 }
257 }
258 if (!error)
259 return 0;
260
261 Thaw:
262 suspend_thaw_processes();
263 usermodehelper_enable();
264 Finish:
265 pm_notifier_call_chain(PM_POST_SUSPEND);
266 pm_restore_console();
267 return error;
268}
269
270/* default implementation */
271void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
272{
273 local_irq_disable();
274}
275
276/* default implementation */
277void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
278{
279 local_irq_enable();
280}
281
282/**
283 * suspend_enter - enter the desired system sleep state.
284 * @state: state to enter
285 *
286 * This function should be called after devices have been suspended.
287 */
288static int suspend_enter(suspend_state_t state)
289{
290 int error;
291
292 device_pm_lock();
293
294 if (suspend_ops->prepare) {
295 error = suspend_ops->prepare();
296 if (error)
297 goto Done;
298 }
299
300 error = device_power_down(PMSG_SUSPEND);
301 if (error) {
302 printk(KERN_ERR "PM: Some devices failed to power down\n");
303 goto Platfrom_finish;
304 }
305
306 if (suspend_ops->prepare_late) {
307 error = suspend_ops->prepare_late();
308 if (error)
309 goto Power_up_devices;
310 }
311
312 if (suspend_test(TEST_PLATFORM))
313 goto Platform_wake;
314
315 error = disable_nonboot_cpus();
316 if (error || suspend_test(TEST_CPUS))
317 goto Enable_cpus;
318
319 arch_suspend_disable_irqs();
320 BUG_ON(!irqs_disabled());
321
322 error = sysdev_suspend(PMSG_SUSPEND);
323 if (!error) {
324 if (!suspend_test(TEST_CORE))
325 error = suspend_ops->enter(state);
326 sysdev_resume();
327 }
328
329 arch_suspend_enable_irqs();
330 BUG_ON(irqs_disabled());
331
332 Enable_cpus:
333 enable_nonboot_cpus();
334
335 Platform_wake:
336 if (suspend_ops->wake)
337 suspend_ops->wake();
338
339 Power_up_devices:
340 device_power_up(PMSG_RESUME);
341
342 Platfrom_finish:
343 if (suspend_ops->finish)
344 suspend_ops->finish();
345
346 Done:
347 device_pm_unlock();
348
349 return error;
350}
351
352/**
353 * suspend_devices_and_enter - suspend devices and enter the desired system
354 * sleep state.
355 * @state: state to enter
356 */
357int suspend_devices_and_enter(suspend_state_t state)
358{
359 int error;
360
361 if (!suspend_ops)
362 return -ENOSYS;
363
364 if (suspend_ops->begin) {
365 error = suspend_ops->begin(state);
366 if (error)
367 goto Close;
368 }
369 suspend_console();
370 suspend_test_start();
371 error = device_suspend(PMSG_SUSPEND);
372 if (error) {
373 printk(KERN_ERR "PM: Some devices failed to suspend\n");
374 goto Recover_platform;
375 }
376 suspend_test_finish("suspend devices");
377 if (suspend_test(TEST_DEVICES))
378 goto Recover_platform;
379
380 suspend_enter(state);
381
382 Resume_devices:
383 suspend_test_start();
384 device_resume(PMSG_RESUME);
385 suspend_test_finish("resume devices");
386 resume_console();
387 Close:
388 if (suspend_ops->end)
389 suspend_ops->end();
390 return error;
391
392 Recover_platform:
393 if (suspend_ops->recover)
394 suspend_ops->recover();
395 goto Resume_devices;
396}
397
398/**
399 * suspend_finish - Do final work before exiting suspend sequence.
400 *
401 * Call platform code to clean up, restart processes, and free the
402 * console that we've allocated. This is not called for suspend-to-disk.
403 */
404static void suspend_finish(void)
405{
406 suspend_thaw_processes();
407 usermodehelper_enable();
408 pm_notifier_call_chain(PM_POST_SUSPEND);
409 pm_restore_console();
410}
411
412
413
414
415static const char * const pm_states[PM_SUSPEND_MAX] = {
416 [PM_SUSPEND_STANDBY] = "standby",
417 [PM_SUSPEND_MEM] = "mem",
418};
419
420static inline int valid_state(suspend_state_t state)
421{
422 /* All states need lowlevel support and need to be valid
423 * to the lowlevel implementation, no valid callback
424 * implies that none are valid. */
425 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
426 return 0;
427 return 1;
428}
429
430
431/**
432 * enter_state - Do common work of entering low-power state.
433 * @state: pm_state structure for state we're entering.
434 *
435 * Make sure we're the only ones trying to enter a sleep state. Fail
436 * if someone has beat us to it, since we don't want anything weird to
437 * happen when we wake up.
438 * Then, do the setup for suspend, enter the state, and cleaup (after
439 * we've woken up).
440 */
441static int enter_state(suspend_state_t state)
442{
443 int error;
444
445 if (!valid_state(state))
446 return -ENODEV;
447
448 if (!mutex_trylock(&pm_mutex))
449 return -EBUSY;
450
451 printk(KERN_INFO "PM: Syncing filesystems ... ");
452 sys_sync();
453 printk("done.\n");
454
455 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
456 error = suspend_prepare();
457 if (error)
458 goto Unlock;
459
460 if (suspend_test(TEST_FREEZER))
461 goto Finish;
462
463 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
464 error = suspend_devices_and_enter(state);
465
466 Finish:
467 pr_debug("PM: Finishing wakeup.\n");
468 suspend_finish();
469 Unlock:
470 mutex_unlock(&pm_mutex);
471 return error;
472}
473
474
475/**
476 * pm_suspend - Externally visible function for suspending system.
477 * @state: Enumerated value of state to enter.
478 *
479 * Determine whether or not value is within range, get state
480 * structure, and enter (above).
481 */
482
483int pm_suspend(suspend_state_t state)
484{
485 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
486 return enter_state(state);
487 return -EINVAL;
488}
489
490EXPORT_SYMBOL(pm_suspend);
491
492#endif /* CONFIG_SUSPEND */
493
494struct kobject *power_kobj; 111struct kobject *power_kobj;
495 112
496/** 113/**
@@ -503,7 +120,6 @@ struct kobject *power_kobj;
503 * store() accepts one of those strings, translates it into the 120 * store() accepts one of those strings, translates it into the
504 * proper enumerated value, and initiates a suspend transition. 121 * proper enumerated value, and initiates a suspend transition.
505 */ 122 */
506
507static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 123static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
508 char *buf) 124 char *buf)
509{ 125{
@@ -601,7 +217,6 @@ static struct attribute_group attr_group = {
601 .attrs = g, 217 .attrs = g,
602}; 218};
603 219
604
605static int __init pm_init(void) 220static int __init pm_init(void)
606{ 221{
607 power_kobj = kobject_create_and_add("power", NULL); 222 power_kobj = kobject_create_and_add("power", NULL);
@@ -611,144 +226,3 @@ static int __init pm_init(void)
611} 226}
612 227
613core_initcall(pm_init); 228core_initcall(pm_init);
614
615
616#ifdef CONFIG_PM_TEST_SUSPEND
617
618#include <linux/rtc.h>
619
620/*
621 * To test system suspend, we need a hands-off mechanism to resume the
622 * system. RTCs wake alarms are a common self-contained mechanism.
623 */
624
625static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626{
627 static char err_readtime[] __initdata =
628 KERN_ERR "PM: can't read %s time, err %d\n";
629 static char err_wakealarm [] __initdata =
630 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
631 static char err_suspend[] __initdata =
632 KERN_ERR "PM: suspend test failed, error %d\n";
633 static char info_test[] __initdata =
634 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
635
636 unsigned long now;
637 struct rtc_wkalrm alm;
638 int status;
639
640 /* this may fail if the RTC hasn't been initialized */
641 status = rtc_read_time(rtc, &alm.time);
642 if (status < 0) {
643 printk(err_readtime, dev_name(&rtc->dev), status);
644 return;
645 }
646 rtc_tm_to_time(&alm.time, &now);
647
648 memset(&alm, 0, sizeof alm);
649 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
650 alm.enabled = true;
651
652 status = rtc_set_alarm(rtc, &alm);
653 if (status < 0) {
654 printk(err_wakealarm, dev_name(&rtc->dev), status);
655 return;
656 }
657
658 if (state == PM_SUSPEND_MEM) {
659 printk(info_test, pm_states[state]);
660 status = pm_suspend(state);
661 if (status == -ENODEV)
662 state = PM_SUSPEND_STANDBY;
663 }
664 if (state == PM_SUSPEND_STANDBY) {
665 printk(info_test, pm_states[state]);
666 status = pm_suspend(state);
667 }
668 if (status < 0)
669 printk(err_suspend, status);
670
671 /* Some platforms can't detect that the alarm triggered the
672 * wakeup, or (accordingly) disable it after it afterwards.
673 * It's supposed to give oneshot behavior; cope.
674 */
675 alm.enabled = false;
676 rtc_set_alarm(rtc, &alm);
677}
678
679static int __init has_wakealarm(struct device *dev, void *name_ptr)
680{
681 struct rtc_device *candidate = to_rtc_device(dev);
682
683 if (!candidate->ops->set_alarm)
684 return 0;
685 if (!device_may_wakeup(candidate->dev.parent))
686 return 0;
687
688 *(const char **)name_ptr = dev_name(dev);
689 return 1;
690}
691
692/*
693 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
694 * at startup time. They're normally disabled, for faster boot and because
695 * we can't know which states really work on this particular system.
696 */
697static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
698
699static char warn_bad_state[] __initdata =
700 KERN_WARNING "PM: can't test '%s' suspend state\n";
701
702static int __init setup_test_suspend(char *value)
703{
704 unsigned i;
705
706 /* "=mem" ==> "mem" */
707 value++;
708 for (i = 0; i < PM_SUSPEND_MAX; i++) {
709 if (!pm_states[i])
710 continue;
711 if (strcmp(pm_states[i], value) != 0)
712 continue;
713 test_state = (__force suspend_state_t) i;
714 return 0;
715 }
716 printk(warn_bad_state, value);
717 return 0;
718}
719__setup("test_suspend", setup_test_suspend);
720
721static int __init test_suspend(void)
722{
723 static char warn_no_rtc[] __initdata =
724 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
725
726 char *pony = NULL;
727 struct rtc_device *rtc = NULL;
728
729 /* PM is initialized by now; is that state testable? */
730 if (test_state == PM_SUSPEND_ON)
731 goto done;
732 if (!valid_state(test_state)) {
733 printk(warn_bad_state, pm_states[test_state]);
734 goto done;
735 }
736
737 /* RTCs have initialized by now too ... can we use one? */
738 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
739 if (pony)
740 rtc = rtc_class_open(pony);
741 if (!rtc) {
742 printk(warn_no_rtc);
743 goto done;
744 }
745
746 /* go for it */
747 test_wakealarm(rtc, test_state);
748 rtc_class_close(rtc);
749done:
750 return 0;
751}
752late_initcall(test_suspend);
753
754#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3afb..26d5a26f82e3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
45 */ 45 */
46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
47 47
48/* kernel/power/disk.c */ 48/* kernel/power/hibernate.c */
49extern int hibernation_snapshot(int platform_mode); 49extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 50extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 51extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern unsigned int count_data_pages(void); 77extern int swsusp_shrink_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
147 */ 147 */
148#define SF_PLATFORM_MODE 1 148#define SF_PLATFORM_MODE 1
149 149
150/* kernel/power/disk.c */ 150/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 151extern int swsusp_check(void);
152extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 152extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
161 unsigned int, char *); 160 unsigned int, char *);
162 161
163#ifdef CONFIG_SUSPEND 162#ifdef CONFIG_SUSPEND
164/* kernel/power/main.c */ 163/* kernel/power/suspend.c */
164extern const char *const pm_states[];
165
166extern bool valid_state(suspend_state_t state);
165extern int suspend_devices_and_enter(suspend_state_t state); 167extern int suspend_devices_and_enter(suspend_state_t state);
168extern int enter_state(suspend_state_t state);
166#else /* !CONFIG_SUSPEND */ 169#else /* !CONFIG_SUSPEND */
167static inline int suspend_devices_and_enter(suspend_state_t state) 170static inline int suspend_devices_and_enter(suspend_state_t state)
168{ 171{
169 return -ENOSYS; 172 return -ENOSYS;
170} 173}
174static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
175static inline bool valid_state(suspend_state_t state) { return false; }
171#endif /* !CONFIG_SUSPEND */ 176#endif /* !CONFIG_SUSPEND */
172 177
178#ifdef CONFIG_PM_TEST_SUSPEND
179/* kernel/power/suspend_test.c */
180extern void suspend_test_start(void);
181extern void suspend_test_finish(const char *label);
182#else /* !CONFIG_PM_TEST_SUSPEND */
183static inline void suspend_test_start(void) {}
184static inline void suspend_test_finish(const char *label) {}
185#endif /* !CONFIG_PM_TEST_SUSPEND */
186
173#ifdef CONFIG_PM_SLEEP 187#ifdef CONFIG_PM_SLEEP
174/* kernel/power/main.c */ 188/* kernel/power/main.c */
175extern int pm_notifier_call_chain(unsigned long val); 189extern int pm_notifier_call_chain(unsigned long val);
176#endif 190#endif
177 191
178#ifdef CONFIG_HIGHMEM 192#ifdef CONFIG_HIGHMEM
179unsigned int count_highmem_pages(void);
180int restore_highmem(void); 193int restore_highmem(void);
181#else 194#else
182static inline unsigned int count_highmem_pages(void) { return 0; } 195static inline unsigned int count_highmem_pages(void) { return 0; }
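The power.h additions keep extending one idiom: real declarations when a config option is enabled, static inline no-ops when it is not, so callers never need #ifdef blocks of their own. A minimal illustration of the idiom, using a made-up CONFIG_FOO and foo_start() helper:

/* Sketch of the "extern or static inline stub" header idiom used above;
 * CONFIG_FOO and foo_start() are invented for the example. */
#include <stdio.h>

/* #define CONFIG_FOO 1 */              /* flip on to use the real version */

#ifdef CONFIG_FOO
extern int foo_start(void);             /* real implementation lives elsewhere */
#else
static inline int foo_start(void) { return 0; }   /* compiled-out stub */
#endif

int main(void)
{
        /* Callers look identical whether CONFIG_FOO is set or not. */
        printf("foo_start() = %d\n", foo_start());
        return 0;
}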
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "powerOff",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
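The freezer now switches the OOM killer off once every task has been frozen and switches it back on in thaw_processes(), presumably because a frozen victim cannot run to give its memory back anyway. A toy model of that pairing, with a plain flag standing in for the kernel's internal state:

/* Sketch of the disable/enable pairing around the frozen window. */
#include <stdio.h>
#include <stdbool.h>

static bool oom_disabled;

static int freeze_processes_model(int freeze_fails)
{
        if (freeze_fails)
                return -1;              /* tasks still runnable: leave OOM alone */
        oom_disabled = true;            /* everything frozen from here on */
        return 0;
}

static void thaw_processes_model(void)
{
        oom_disabled = false;           /* re-enable before tasks run again */
        printf("thawed, oom_disabled=%d\n", oom_disabled);
}

int main(void)
{
        if (!freeze_processes_model(0))
                thaw_processes_model();
        return 0;
}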
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f9..523a451b45d3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
39static void swsusp_set_page_forbidden(struct page *); 39static void swsusp_set_page_forbidden(struct page *);
40static void swsusp_unset_page_forbidden(struct page *); 40static void swsusp_unset_page_forbidden(struct page *);
41 41
42/*
43 * Preferred image size in bytes (tunable via /sys/power/image_size).
44 * When it is set to N, swsusp will do its best to ensure the image
45 * size will not exceed N bytes, but if that is impossible, it will
46 * try to create the smallest image possible.
47 */
48unsigned long image_size = 500 * 1024 * 1024;
49
42/* List of PBEs needed for restoring the pages that were allocated before 50/* List of PBEs needed for restoring the pages that were allocated before
43 * the suspend and included in the suspend image, but have also been 51 * the suspend and included in the suspend image, but have also been
44 * allocated by the "resume" kernel, so their contents cannot be written 52 * allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
840 * pages. 848 * pages.
841 */ 849 */
842 850
843unsigned int count_highmem_pages(void) 851static unsigned int count_highmem_pages(void)
844{ 852{
845 struct zone *zone; 853 struct zone *zone;
846 unsigned int n = 0; 854 unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
902 * pages. 910 * pages.
903 */ 911 */
904 912
905unsigned int count_data_pages(void) 913static unsigned int count_data_pages(void)
906{ 914{
907 struct zone *zone; 915 struct zone *zone;
908 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
1058 buffer = NULL; 1066 buffer = NULL;
1059} 1067}
1060 1068
1069/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed
1071 *
1072 * ... but do not OOM-kill anyone
1073 *
1074 * Notice: all userland should be stopped before it is called, or
1075 * livelock is possible.
1076 */
1077
1078#define SHRINK_BITE 10000
1079static inline unsigned long __shrink_memory(long tmp)
1080{
1081 if (tmp > SHRINK_BITE)
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084}
1085
1086int swsusp_shrink_memory(void)
1087{
1088 long tmp;
1089 struct zone *zone;
1090 unsigned long pages = 0;
1091 unsigned int i = 0;
1092 char *p = "-\\|/";
1093 struct timeval start, stop;
1094
1095 printk(KERN_INFO "PM: Shrinking memory... ");
1096 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114
1115 if (highmem_size < 0)
1116 highmem_size = 0;
1117
1118 tmp += highmem_size;
1119 if (tmp > 0) {
1120 tmp = __shrink_memory(tmp);
1121 if (!tmp)
1122 return -ENOMEM;
1123 pages += tmp;
1124 } else if (size > image_size / PAGE_SIZE) {
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
1126 pages += tmp;
1127 }
1128 printk("\b%c", p[i++%4]);
1129 } while (tmp > 0);
1130 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed");
1133
1134 return 0;
1135}
1136
1061#ifdef CONFIG_HIGHMEM 1137#ifdef CONFIG_HIGHMEM
1062/** 1138/**
1063 * count_pages_for_highmem - compute the number of non-highmem pages 1139 * count_pages_for_highmem - compute the number of non-highmem pages
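swsusp_shrink_memory() now sits next to the snapshot code together with the image_size tunable it honours: each pass estimates how many pages the image still needs beyond what is free, reclaims at most SHRINK_BITE pages per call, and keeps looping while progress is made or the image is still above image_size. A simplified user-space model of that loop follows; free_some(), the page counts and the limit are made-up stand-ins for the kernel's zone statistics and image_size.

/* Simplified model of the swsusp_shrink_memory() loop. */
#include <stdio.h>

#define SHRINK_BITE 10000L

static long reclaimable = 30000;        /* pretend-reclaimable pages */

static long free_some(long want)        /* stands in for shrink_all_memory() */
{
        long got = want < reclaimable ? want : reclaimable;

        reclaimable -= got;
        return got;
}

static long shrink_bite(long want)      /* mirrors __shrink_memory() */
{
        return free_some(want > SHRINK_BITE ? SHRINK_BITE : want);
}

int main(void)
{
        long image_pages = 120000;      /* pages the snapshot would need */
        long free_pages  = 100000;      /* pages currently free          */
        long image_limit = 105000;      /* image_size expressed in pages */
        long freed = 0, got;

        do {
                long deficit = image_pages - free_pages;

                if (deficit > 0) {
                        got = shrink_bite(deficit);
                        if (!got) {
                                puts("cannot make the image fit: -ENOMEM");
                                return 1;
                        }
                } else if (image_pages > image_limit) {
                        got = shrink_bite(image_pages - image_limit);
                } else {
                        got = 0;
                }
                image_pages -= got;     /* freed cache no longer gets saved */
                free_pages  += got;
                freed       += got;
        } while (got > 0);

        printf("freed %ld pages, image now %ld pages\n", freed, image_pages);
        return 0;
}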
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 000000000000..6f10dfc2d3e9
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
1/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/string.h>
12#include <linux/delay.h>
13#include <linux/errno.h>
14#include <linux/init.h>
15#include <linux/console.h>
16#include <linux/cpu.h>
17#include <linux/syscalls.h>
18
19#include "power.h"
20
21const char *const pm_states[PM_SUSPEND_MAX] = {
22 [PM_SUSPEND_STANDBY] = "standby",
23 [PM_SUSPEND_MEM] = "mem",
24};
25
26static struct platform_suspend_ops *suspend_ops;
27
28/**
29 * suspend_set_ops - Set the global suspend method table.
30 * @ops: Pointer to ops structure.
31 */
32void suspend_set_ops(struct platform_suspend_ops *ops)
33{
34 mutex_lock(&pm_mutex);
35 suspend_ops = ops;
36 mutex_unlock(&pm_mutex);
37}
38
39bool valid_state(suspend_state_t state)
40{
41 /*
42 * All states need lowlevel support and need to be valid to the lowlevel
43 * implementation, no valid callback implies that none are valid.
44 */
45 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
46}
47
48/**
49 * suspend_valid_only_mem - generic memory-only valid callback
50 *
51 * Platform drivers that implement mem suspend only and only need
52 * to check for that in their .valid callback can use this instead
53 * of rolling their own .valid callback.
54 */
55int suspend_valid_only_mem(suspend_state_t state)
56{
57 return state == PM_SUSPEND_MEM;
58}
59
60static int suspend_test(int level)
61{
62#ifdef CONFIG_PM_DEBUG
63 if (pm_test_level == level) {
64 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
65 mdelay(5000);
66 return 1;
67 }
68#endif /* !CONFIG_PM_DEBUG */
69 return 0;
70}
71
72/**
73 * suspend_prepare - Do prep work before entering low-power state.
74 *
75 * This is common code that is called for each state that we're entering.
76 * Run suspend notifiers, allocate a console and stop all processes.
77 */
78static int suspend_prepare(void)
79{
80 int error;
81
82 if (!suspend_ops || !suspend_ops->enter)
83 return -EPERM;
84
85 pm_prepare_console();
86
87 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
88 if (error)
89 goto Finish;
90
91 error = usermodehelper_disable();
92 if (error)
93 goto Finish;
94
95 error = suspend_freeze_processes();
96 if (!error)
97 return 0;
98
99 suspend_thaw_processes();
100 usermodehelper_enable();
101 Finish:
102 pm_notifier_call_chain(PM_POST_SUSPEND);
103 pm_restore_console();
104 return error;
105}
106
107/* default implementation */
108void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
109{
110 local_irq_disable();
111}
112
113/* default implementation */
114void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
115{
116 local_irq_enable();
117}
118
119/**
120 * suspend_enter - enter the desired system sleep state.
121 * @state: state to enter
122 *
123 * This function should be called after devices have been suspended.
124 */
125static int suspend_enter(suspend_state_t state)
126{
127 int error;
128
129 if (suspend_ops->prepare) {
130 error = suspend_ops->prepare();
131 if (error)
132 return error;
133 }
134
135 error = dpm_suspend_noirq(PMSG_SUSPEND);
136 if (error) {
137 printk(KERN_ERR "PM: Some devices failed to power down\n");
138 goto Platfrom_finish;
139 }
140
141 if (suspend_ops->prepare_late) {
142 error = suspend_ops->prepare_late();
143 if (error)
144 goto Power_up_devices;
145 }
146
147 if (suspend_test(TEST_PLATFORM))
148 goto Platform_wake;
149
150 error = disable_nonboot_cpus();
151 if (error || suspend_test(TEST_CPUS))
152 goto Enable_cpus;
153
154 arch_suspend_disable_irqs();
155 BUG_ON(!irqs_disabled());
156
157 error = sysdev_suspend(PMSG_SUSPEND);
158 if (!error) {
159 if (!suspend_test(TEST_CORE))
160 error = suspend_ops->enter(state);
161 sysdev_resume();
162 }
163
164 arch_suspend_enable_irqs();
165 BUG_ON(irqs_disabled());
166
167 Enable_cpus:
168 enable_nonboot_cpus();
169
170 Platform_wake:
171 if (suspend_ops->wake)
172 suspend_ops->wake();
173
174 Power_up_devices:
175 dpm_resume_noirq(PMSG_RESUME);
176
177 Platfrom_finish:
178 if (suspend_ops->finish)
179 suspend_ops->finish();
180
181 return error;
182}
183
184/**
185 * suspend_devices_and_enter - suspend devices and enter the desired system
186 * sleep state.
187 * @state: state to enter
188 */
189int suspend_devices_and_enter(suspend_state_t state)
190{
191 int error;
192
193 if (!suspend_ops)
194 return -ENOSYS;
195
196 if (suspend_ops->begin) {
197 error = suspend_ops->begin(state);
198 if (error)
199 goto Close;
200 }
201 suspend_console();
202 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) {
205 printk(KERN_ERR "PM: Some devices failed to suspend\n");
206 goto Recover_platform;
207 }
208 suspend_test_finish("suspend devices");
209 if (suspend_test(TEST_DEVICES))
210 goto Recover_platform;
211
212 suspend_enter(state);
213
214 Resume_devices:
215 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices");
218 resume_console();
219 Close:
220 if (suspend_ops->end)
221 suspend_ops->end();
222 return error;
223
224 Recover_platform:
225 if (suspend_ops->recover)
226 suspend_ops->recover();
227 goto Resume_devices;
228}
229
230/**
231 * suspend_finish - Do final work before exiting suspend sequence.
232 *
233 * Call platform code to clean up, restart processes, and free the
234 * console that we've allocated. This is not called for suspend-to-disk.
235 */
236static void suspend_finish(void)
237{
238 suspend_thaw_processes();
239 usermodehelper_enable();
240 pm_notifier_call_chain(PM_POST_SUSPEND);
241 pm_restore_console();
242}
243
244/**
245 * enter_state - Do common work of entering low-power state.
246 * @state: pm_state structure for state we're entering.
247 *
248 * Make sure we're the only ones trying to enter a sleep state. Fail
249 * if someone has beat us to it, since we don't want anything weird to
250 * happen when we wake up.
251 * Then, do the setup for suspend, enter the state, and cleaup (after
252 * we've woken up).
253 */
254int enter_state(suspend_state_t state)
255{
256 int error;
257
258 if (!valid_state(state))
259 return -ENODEV;
260
261 if (!mutex_trylock(&pm_mutex))
262 return -EBUSY;
263
264 printk(KERN_INFO "PM: Syncing filesystems ... ");
265 sys_sync();
266 printk("done.\n");
267
268 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
269 error = suspend_prepare();
270 if (error)
271 goto Unlock;
272
273 if (suspend_test(TEST_FREEZER))
274 goto Finish;
275
276 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
277 error = suspend_devices_and_enter(state);
278
279 Finish:
280 pr_debug("PM: Finishing wakeup.\n");
281 suspend_finish();
282 Unlock:
283 mutex_unlock(&pm_mutex);
284 return error;
285}
286
287/**
288 * pm_suspend - Externally visible function for suspending system.
289 * @state: Enumerated value of state to enter.
290 *
291 * Determine whether or not value is within range, get state
292 * structure, and enter (above).
293 */
294int pm_suspend(suspend_state_t state)
295{
296 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
297 return enter_state(state);
298 return -EINVAL;
299}
300EXPORT_SYMBOL(pm_suspend);
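The new suspend.c centralises the platform hook table: a state counts as valid only if a platform registered suspend_ops, implemented .valid, and .valid(state) agrees, and pm_suspend() additionally range-checks the requested state before going any further. A small stand-alone model of that gatekeeping; the enum and ops struct are simplified stand-ins, not the kernel's types.

/* Sketch of the valid_state()/pm_suspend() gatekeeping in suspend.c. */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

enum state { STATE_ON, STATE_STANDBY, STATE_MEM, STATE_MAX };

struct platform_ops {
        bool (*valid)(enum state);      /* optional */
        int  (*enter)(enum state);
};

static const struct platform_ops *ops;  /* set by the platform, may be NULL */

static bool valid_state(enum state s)
{
        /* no ops, or no .valid callback, means nothing is valid */
        return ops && ops->valid && ops->valid(s);
}

static int do_suspend(enum state s)
{
        if (s <= STATE_ON || s >= STATE_MAX)
                return -EINVAL;         /* range check, loosely as in pm_suspend() */
        if (!valid_state(s))
                return -ENODEV;
        return ops->enter(s);
}

/* A platform that only supports suspend-to-RAM, like suspend_valid_only_mem() */
static bool mem_only(enum state s) { return s == STATE_MEM; }
static int  enter_ok(enum state s) { printf("entering state %d\n", s); return 0; }
static const struct platform_ops demo_ops = { mem_only, enter_ok };

int main(void)
{
        printf("%d\n", do_suspend(STATE_MEM));      /* -ENODEV: no ops yet  */
        ops = &demo_ops;
        printf("%d\n", do_suspend(STATE_STANDBY));  /* -ENODEV: not valid   */
        printf("%d\n", do_suspend(STATE_MEM));      /* 0: platform enters   */
        return 0;
}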
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 000000000000..17d8bb1acf9c
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
1/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/init.h>
10#include <linux/rtc.h>
11
12#include "power.h"
13
14/*
15 * We test the system suspend code by setting an RTC wakealarm a short
16 * time in the future, then suspending. Suspending the devices won't
17 * normally take long ... some systems only need a few milliseconds.
18 *
19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time.
21 */
22#define TEST_SUSPEND_SECONDS 5
23
24static unsigned long suspend_test_start_time;
25
26void suspend_test_start(void)
27{
28 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
29 * What we want is a hardware counter that will work correctly even
30 * during the irqs-are-off stages of the suspend/resume cycle...
31 */
32 suspend_test_start_time = jiffies;
33}
34
35void suspend_test_finish(const char *label)
36{
37 long nj = jiffies - suspend_test_start_time;
38 unsigned msec;
39
40 msec = jiffies_to_msecs(abs(nj));
41 pr_info("PM: %s took %d.%03d seconds\n", label,
42 msec / 1000, msec % 1000);
43
44 /* Warning on suspend means the RTC alarm period needs to be
45 * larger -- the system was sooo slooowwww to suspend that the
46 * alarm (should have) fired before the system went to sleep!
47 *
48 * Warning on either suspend or resume also means the system
49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk...
51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
53}
54
55/*
56 * To test system suspend, we need a hands-off mechanism to resume the
57 * system. RTCs wake alarms are a common self-contained mechanism.
58 */
59
60static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
61{
62 static char err_readtime[] __initdata =
63 KERN_ERR "PM: can't read %s time, err %d\n";
64 static char err_wakealarm [] __initdata =
65 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
66 static char err_suspend[] __initdata =
67 KERN_ERR "PM: suspend test failed, error %d\n";
68 static char info_test[] __initdata =
69 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
70
71 unsigned long now;
72 struct rtc_wkalrm alm;
73 int status;
74
75 /* this may fail if the RTC hasn't been initialized */
76 status = rtc_read_time(rtc, &alm.time);
77 if (status < 0) {
78 printk(err_readtime, dev_name(&rtc->dev), status);
79 return;
80 }
81 rtc_tm_to_time(&alm.time, &now);
82
83 memset(&alm, 0, sizeof alm);
84 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
85 alm.enabled = true;
86
87 status = rtc_set_alarm(rtc, &alm);
88 if (status < 0) {
89 printk(err_wakealarm, dev_name(&rtc->dev), status);
90 return;
91 }
92
93 if (state == PM_SUSPEND_MEM) {
94 printk(info_test, pm_states[state]);
95 status = pm_suspend(state);
96 if (status == -ENODEV)
97 state = PM_SUSPEND_STANDBY;
98 }
99 if (state == PM_SUSPEND_STANDBY) {
100 printk(info_test, pm_states[state]);
101 status = pm_suspend(state);
102 }
103 if (status < 0)
104 printk(err_suspend, status);
105
106 /* Some platforms can't detect that the alarm triggered the
107 * wakeup, or (accordingly) disable it after it afterwards.
108 * It's supposed to give oneshot behavior; cope.
109 */
110 alm.enabled = false;
111 rtc_set_alarm(rtc, &alm);
112}
113
114static int __init has_wakealarm(struct device *dev, void *name_ptr)
115{
116 struct rtc_device *candidate = to_rtc_device(dev);
117
118 if (!candidate->ops->set_alarm)
119 return 0;
120 if (!device_may_wakeup(candidate->dev.parent))
121 return 0;
122
123 *(const char **)name_ptr = dev_name(dev);
124 return 1;
125}
126
127/*
128 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
129 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system.
131 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
133
134static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n";
136
137static int __init setup_test_suspend(char *value)
138{
139 unsigned i;
140
141 /* "=mem" ==> "mem" */
142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) {
144 if (!pm_states[i])
145 continue;
146 if (strcmp(pm_states[i], value) != 0)
147 continue;
148 test_state = (__force suspend_state_t) i;
149 return 0;
150 }
151 printk(warn_bad_state, value);
152 return 0;
153}
154__setup("test_suspend", setup_test_suspend);
155
156static int __init test_suspend(void)
157{
158 static char warn_no_rtc[] __initdata =
159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
160
161 char *pony = NULL;
162 struct rtc_device *rtc = NULL;
163
164 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON)
166 goto done;
167 if (!valid_state(test_state)) {
168 printk(warn_bad_state, pm_states[test_state]);
169 goto done;
170 }
171
172 /* RTCs have initialized by now too ... can we use one? */
173 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
174 if (pony)
175 rtc = rtc_class_open(pony);
176 if (!rtc) {
177 printk(warn_no_rtc);
178 goto done;
179 }
180
181 /* go for it */
182 test_wakealarm(rtc, test_state);
183 rtc_class_close(rtc);
184done:
185 return 0;
186}
187late_initcall(test_suspend);
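suspend_test.c keeps the boot-time self-test: a test_suspend=mem (or =standby) option is matched against the sparse pm_states[] table, an RTC wakealarm is armed a few seconds ahead, and pm_suspend() is called so the alarm proves the resume path works. The string-to-state lookup is the reusable piece; a stand-alone version of it is sketched below, with the table contents mirroring pm_states[] and the RTC part left out entirely.

/* Sketch of the "=mem"/"=standby" lookup done by setup_test_suspend(). */
#include <stdio.h>
#include <string.h>

enum state { STATE_ON, STATE_STANDBY, STATE_MEM, STATE_MAX };

static const char *const state_names[STATE_MAX] = {
        [STATE_STANDBY] = "standby",    /* index STATE_ON stays NULL */
        [STATE_MEM]     = "mem",
};

static enum state parse_test_suspend(const char *value)
{
        unsigned i;

        value++;                        /* "=mem" ==> "mem" */
        for (i = 0; i < STATE_MAX; i++) {
                if (!state_names[i])    /* skip holes in the sparse table */
                        continue;
                if (!strcmp(state_names[i], value))
                        return (enum state)i;
        }
        fprintf(stderr, "can't test '%s' suspend state\n", value);
        return STATE_ON;                /* i.e. test nothing */
}

int main(void)
{
        printf("%d\n", parse_test_suspend("=mem"));      /* 2 */
        printf("%d\n", parse_test_suspend("=standby"));  /* 1 */
        printf("%d\n", parse_test_suspend("=bogus"));    /* 0 plus a warning */
        return 0;
}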
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586d..6a07f4dbf2f8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
55 55
56#include "power.h" 56#include "power.h"
57 57
58/*
59 * Preferred image size in bytes (tunable via /sys/power/image_size).
60 * When it is set to N, swsusp will do its best to ensure the image
61 * size will not exceed N bytes, but if that is impossible, it will
62 * try to create the smallest image possible.
63 */
64unsigned long image_size = 500 * 1024 * 1024;
65
66int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
67 59
68/** 60/**
@@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
194 centisecs / 100, centisecs % 100, 186 centisecs / 100, centisecs % 100,
195 kps / 1000, (kps % 1000) / 10); 187 kps / 1000, (kps % 1000) / 10);
196} 188}
197
198/**
199 * swsusp_shrink_memory - Try to free as much memory as needed
200 *
201 * ... but do not OOM-kill anyone
202 *
203 * Notice: all userland should be stopped before it is called, or
204 * livelock is possible.
205 */
206
207#define SHRINK_BITE 10000
208static inline unsigned long __shrink_memory(long tmp)
209{
210 if (tmp > SHRINK_BITE)
211 tmp = SHRINK_BITE;
212 return shrink_all_memory(tmp);
213}
214
215int swsusp_shrink_memory(void)
216{
217 long tmp;
218 struct zone *zone;
219 unsigned long pages = 0;
220 unsigned int i = 0;
221 char *p = "-\\|/";
222 struct timeval start, stop;
223
224 printk(KERN_INFO "PM: Shrinking memory... ");
225 do_gettimeofday(&start);
226 do {
227 long size, highmem_size;
228
229 highmem_size = count_highmem_pages();
230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
231 tmp = size;
232 size += highmem_size;
233 for_each_populated_zone(zone) {
234 tmp += snapshot_additional_pages(zone);
235 if (is_highmem(zone)) {
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES);
238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 }
243
244 if (highmem_size < 0)
245 highmem_size = 0;
246
247 tmp += highmem_size;
248 if (tmp > 0) {
249 tmp = __shrink_memory(tmp);
250 if (!tmp)
251 return -ENOMEM;
252 pages += tmp;
253 } else if (size > image_size / PAGE_SIZE) {
254 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
255 pages += tmp;
256 }
257 printk("\b%c", p[i++%4]);
258 } while (tmp > 0);
259 do_gettimeofday(&stop);
260 printk("\bdone (%lu pages freed)\n", pages);
261 swsusp_show_speed(&start, &stop, pages, "Freed");
262
263 return 0;
264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
687 sizeof(printk_buf) - printed_len, fmt, args); 687 sizeof(printk_buf) - printed_len, fmt, args);
688 688
689 689
690 p = printk_buf;
691
692 /* Do we have a loglevel in the string? */
693 if (p[0] == '<') {
694 unsigned char c = p[1];
695 if (c && p[2] == '>') {
696 switch (c) {
697 case '0' ... '7': /* loglevel */
698 current_log_level = c - '0';
699 /* Fallthrough - make sure we're on a new line */
700 case 'd': /* KERN_DEFAULT */
701 if (!new_text_line) {
702 emit_log_char('\n');
703 new_text_line = 1;
704 }
705 /* Fallthrough - skip the loglevel */
706 case 'c': /* KERN_CONT */
707 p += 3;
708 break;
709 }
710 }
711 }
712
690 /* 713 /*
691 * Copy the output into log_buf. If the caller didn't provide 714 * Copy the output into log_buf. If the caller didn't provide
692 * appropriate log level tags, we insert them here 715 * appropriate log level tags, we insert them here
693 */ 716 */
694 for (p = printk_buf; *p; p++) { 717 for ( ; *p; p++) {
695 if (new_text_line) { 718 if (new_text_line) {
696 /* If a token, set current_log_level and skip over */
697 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
698 p[2] == '>') {
699 current_log_level = p[1] - '0';
700 p += 3;
701 printed_len -= 3;
702 }
703
704 /* Always output the token */ 719 /* Always output the token */
705 emit_log_char('<'); 720 emit_log_char('<');
706 emit_log_char(current_log_level + '0'); 721 emit_log_char(current_log_level + '0');
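The reworked vprintk() hunk above recognises three kinds of prefix before copying into the log buffer: "<0>".."<7>" set the level and force a fresh line, "<d>" keeps the default level but still forces a fresh line, and "<c>" marks an explicit continuation; in all three cases the three prefix characters are skipped. A user-space rendering of that parser follows; parse_prefix() and its globals are invented for the example and only approximate the in-kernel state handling.

/* Sketch of the "<0>".."<7>", "<d>", "<c>" prefix handling added to vprintk(). */
#include <stdio.h>
#include <stdbool.h>

static int  current_level = 4;          /* stand-in for the default loglevel */
static bool new_text_line = true;

static const char *parse_prefix(const char *p)
{
        if (p[0] != '<' || !p[1] || p[2] != '>')
                return p;               /* no recognised prefix */

        switch (p[1]) {
        case '0' ... '7':               /* explicit loglevel (GNU C case range) */
                current_level = p[1] - '0';
                /* fall through: a level also starts a new line */
        case 'd':                       /* KERN_DEFAULT: new line, keep level */
                new_text_line = true;
                /* fall through: all three skip the prefix */
        case 'c':                       /* KERN_CONT: continue the previous line */
                return p + 3;
        }
        return p;                       /* "<x>" with anything else prints as-is */
}

int main(void)
{
        const char *samples[] = { "<3>oops", "<d>hello", "<c>world", "plain" };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                const char *body;

                new_text_line = false;  /* pretend we were mid-line */
                body = parse_prefix(samples[i]);
                printf("level=%d newline=%d body=\"%s\"\n",
                       current_level, new_text_line, body);
        }
        return 0;
}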
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..69911b5745eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
@@ -371,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
371 node = cpu_to_node(cpu); 365 node = cpu_to_node(cpu);
372 per_cpu(cpu_profile_flip, cpu) = 0; 366 per_cpu(cpu_profile_flip, cpu) = 0;
373 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 367 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
374 page = alloc_pages_node(node, 368 page = alloc_pages_exact_node(node,
375 GFP_KERNEL | __GFP_ZERO, 369 GFP_KERNEL | __GFP_ZERO,
376 0); 370 0);
377 if (!page) 371 if (!page)
@@ -379,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
379 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
380 } 374 }
381 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 375 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
382 page = alloc_pages_node(node, 376 page = alloc_pages_exact_node(node,
383 GFP_KERNEL | __GFP_ZERO, 377 GFP_KERNEL | __GFP_ZERO,
384 0); 378 0);
385 if (!page) 379 if (!page)
@@ -570,14 +564,14 @@ static int create_hash_tables(void)
570 int node = cpu_to_node(cpu); 564 int node = cpu_to_node(cpu);
571 struct page *page; 565 struct page *page;
572 566
573 page = alloc_pages_node(node, 567 page = alloc_pages_exact_node(node,
574 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
575 0); 569 0);
576 if (!page) 570 if (!page)
577 goto out_cleanup; 571 goto out_cleanup;
578 per_cpu(cpu_profile_hits, cpu)[1] 572 per_cpu(cpu_profile_hits, cpu)[1]
579 = (struct profile_hit *)page_address(page); 573 = (struct profile_hit *)page_address(page);
580 page = alloc_pages_node(node, 574 page = alloc_pages_exact_node(node,
581 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
582 0); 576 0);
583 if (!page) 577 if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e950805f8630..f6d8b8cb5e34 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -175,10 +175,11 @@ int ptrace_attach(struct task_struct *task)
175 if (same_thread_group(task, current)) 175 if (same_thread_group(task, current))
176 goto out; 176 goto out;
177 177
178 /* Protect exec's credential calculations against our interference; 178 /* Protect the target's credential calculations against our
179 * SUID, SGID and LSM creds get determined differently under ptrace. 179 * interference; SUID, SGID and LSM creds get determined differently
180 * under ptrace.
180 */ 181 */
181 retval = mutex_lock_interruptible(&task->cred_exec_mutex); 182 retval = mutex_lock_interruptible(&task->cred_guard_mutex);
182 if (retval < 0) 183 if (retval < 0)
183 goto out; 184 goto out;
184 185
@@ -222,7 +223,7 @@ repeat:
222bad: 223bad:
223 write_unlock_irqrestore(&tasklist_lock, flags); 224 write_unlock_irqrestore(&tasklist_lock, flags);
224 task_unlock(task); 225 task_unlock(task);
225 mutex_unlock(&task->cred_exec_mutex); 226 mutex_unlock(&task->cred_guard_mutex);
226out: 227out:
227 return retval; 228 return retval;
228} 229}
@@ -294,6 +295,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
294 if (child->ptrace) { 295 if (child->ptrace) {
295 child->exit_code = data; 296 child->exit_code = data;
296 dead = __ptrace_detach(current, child); 297 dead = __ptrace_detach(current, child);
298 if (!child->exit_state)
299 wake_up_process(child);
297 } 300 }
298 write_unlock_irq(&tasklist_lock); 301 write_unlock_irq(&tasklist_lock);
299 302
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d3..beb0e659adcc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
1356 1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; 1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); 1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; 1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq, 1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, 1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret); 1362 ret);
1363 1363
1364 /*
1365 * Signals would prevent us from sleeping, and we cannot
1366 * do much with them in any case. So flush them.
1367 */
1368 if (ret)
1369 flush_signals(current);
1370 couldsleepnext = 0; 1364 couldsleepnext = 0;
1371 1365
1372 } while (!kthread_should_stop()); 1366 } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b..0dccfbba6d26 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1259 check_cpu_stall(rsp, rdp); 1259 check_cpu_stall(rsp, rdp);
1260 1260
1261 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1261 /* Is the RCU core waiting for a quiescent state from this CPU? */
1262 if (rdp->qs_pending) 1262 if (rdp->qs_pending) {
1263 rdp->n_rp_qs_pending++;
1263 return 1; 1264 return 1;
1265 }
1264 1266
1265 /* Does this CPU have callbacks ready to invoke? */ 1267 /* Does this CPU have callbacks ready to invoke? */
1266 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1268 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1269 rdp->n_rp_cb_ready++;
1267 return 1; 1270 return 1;
1271 }
1268 1272
1269 /* Has RCU gone idle with this CPU needing another grace period? */ 1273 /* Has RCU gone idle with this CPU needing another grace period? */
1270 if (cpu_needs_another_gp(rsp, rdp)) 1274 if (cpu_needs_another_gp(rsp, rdp)) {
1275 rdp->n_rp_cpu_needs_gp++;
1271 return 1; 1276 return 1;
1277 }
1272 1278
1273 /* Has another RCU grace period completed? */ 1279 /* Has another RCU grace period completed? */
1274 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ 1280 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
1281 rdp->n_rp_gp_completed++;
1275 return 1; 1282 return 1;
1283 }
1276 1284
1277 /* Has a new RCU grace period started? */ 1285 /* Has a new RCU grace period started? */
1278 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ 1286 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
1287 rdp->n_rp_gp_started++;
1279 return 1; 1288 return 1;
1289 }
1280 1290
1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) 1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1294 rdp->n_rp_need_fqs++;
1284 return 1; 1295 return 1;
1296 }
1285 1297
1286 /* nothing to do */ 1298 /* nothing to do */
1299 rdp->n_rp_need_nothing++;
1287 return 0; 1300 return 0;
1288} 1301}
1289 1302
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba9404..fe1dcdbf1ca3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
213 .release = single_release, 213 .release = single_release,
214}; 214};
215 215
216static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; 216static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
217{
218 seq_printf(m, "%3d%cnp=%ld "
219 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
220 rdp->cpu,
221 cpu_is_offline(rdp->cpu) ? '!' : ' ',
222 rdp->n_rcu_pending,
223 rdp->n_rp_qs_pending,
224 rdp->n_rp_cb_ready,
225 rdp->n_rp_cpu_needs_gp,
226 rdp->n_rp_gp_completed,
227 rdp->n_rp_gp_started,
228 rdp->n_rp_need_fqs,
229 rdp->n_rp_need_nothing);
230}
231
232static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
233{
234 int cpu;
235 struct rcu_data *rdp;
236
237 for_each_possible_cpu(cpu) {
238 rdp = rsp->rda[cpu];
239 if (rdp->beenonline)
240 print_one_rcu_pending(m, rdp);
241 }
242}
243
244static int show_rcu_pending(struct seq_file *m, void *unused)
245{
246 seq_puts(m, "rcu:\n");
247 print_rcu_pendings(m, &rcu_state);
248 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state);
250 return 0;
251}
252
253static int rcu_pending_open(struct inode *inode, struct file *file)
254{
255 return single_open(file, show_rcu_pending, NULL);
256}
257
258static struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE,
260 .open = rcu_pending_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272
217static int __init rcuclassic_trace_init(void) 273static int __init rcuclassic_trace_init(void)
218{ 274{
219 rcudir = debugfs_create_dir("rcu", NULL); 275 rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
238 NULL, &rcuhier_fops); 294 NULL, &rcuhier_fops);
239 if (!hierdir) 295 if (!hierdir)
240 goto free_out; 296 goto free_out;
297
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir)
301 goto free_out;
241 return 0; 302 return 0;
242free_out: 303free_out:
243 if (datadir) 304 if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
257 debugfs_remove(datadir_csv); 318 debugfs_remove(datadir_csv);
258 debugfs_remove(gpdir); 319 debugfs_remove(gpdir);
259 debugfs_remove(hierdir); 320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
260 debugfs_remove(rcudir); 322 debugfs_remove(rcudir);
261} 323}
262 324
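The new rcu_pending file prints one line per CPU that has ever been online, using the seq_printf() format in print_one_rcu_pending() above. As a rough illustration of what such a line looks like, the standalone snippet below reuses that exact format string in userspace; the counter values are invented, and reading the real file assumes debugfs is mounted in its usual place.

/* Illustration only: reuse the seq_printf() format from
 * print_one_rcu_pending() to show the shape of one line of the new
 * rcu_pending debugfs file.  The counter values are made up. */
#include <stdio.h>

int main(void)
{
	int cpu = 0;
	int offline = 0;                        /* cpu_is_offline(cpu)? */
	long np = 14283;                        /* n_rcu_pending */
	long qsp = 1204, cbr = 97, cng = 3;     /* n_rp_qs_pending, ... */
	long gpc = 7120, gps = 7119, nf = 41, nn = 5699;

	printf("%3d%cnp=%ld "
	       "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
	       cpu, offline ? '!' : ' ', np,
	       qsp, cbr, cng, gpc, gps, nf, nn);
	return 0;
}

Run, this prints "  0 np=14283 qsp=1204 cbr=97 cng=3 gpc=7120 gps=7119 nf=41 nn=5699", the layout into which the n_rp_* counters added to __rcu_pending() in rcutree.c above are reported.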
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..fcd107a78c5a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority as current->normal_prio. 346 * priority as task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
875 * Returns: 902 * Returns:
876 * 0 on success 903 * 0 on success
877 * -EINTR when interrupted by a signal 904 * -EINTR when interrupted by a signal
878 * -ETIMEOUT when the timeout expired 905 * -ETIMEDOUT when the timeout expired
879 * -EDEADLK when the lock would deadlock (when deadlock detection is on) 906 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
880 */ 907 */
881int 908int
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0);
1044
1045 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1;
1047 }
1048
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050
1051
1052 if (ret && !waiter->task) {
1053 /*
1054 * Reset the return value. We might have
1055 * returned with -EDEADLK and the owner
1056 * released the lock while we were walking the
1057 * pi chain. Let the waiter sort it out.
1058 */
1059 ret = 0;
1060 }
1061 spin_unlock(&lock->wait_lock);
1062
1063 debug_rt_mutex_print_deadlock(waiter);
1064
1065 return ret;
1066}
1067
1068/**
989 * rt_mutex_next_owner - return the next owner of the lock 1069 * rt_mutex_next_owner - return the next owner of the lock
990 * 1070 *
991 * @lock: the rt lock query 1071 * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1084
1005 return rt_mutex_top_waiter(lock)->task; 1085 return rt_mutex_top_waiter(lock)->task;
1006} 1086}
1087
1088/**
1089 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1090 * @lock: the rt_mutex we were woken on
1091 * @to: the timeout, null if none. hrtimer should already have
1092 * been started.
1093 * @waiter: the pre-initialized rt_mutex_waiter
1094 * @detect_deadlock: perform deadlock detection (1) or not (0)
1095 *
1096 * Complete the lock acquisition started on our behalf by another thread.
1097 *
1098 * Returns:
1099 * 0 - success
1100 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1101 *
1102 * Special API call for PI-futex requeue support
1103 */
1104int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1105 struct hrtimer_sleeper *to,
1106 struct rt_mutex_waiter *waiter,
1107 int detect_deadlock)
1108{
1109 int ret;
1110
1111 spin_lock(&lock->wait_lock);
1112
1113 set_current_state(TASK_INTERRUPTIBLE);
1114
1115 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1116 detect_deadlock);
1117
1118 set_current_state(TASK_RUNNING);
1119
1120 if (unlikely(waiter->task))
1121 remove_waiter(lock, waiter);
1122
1123 /*
1124 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1125 * have to fix that up.
1126 */
1127 fixup_rt_mutex_waiters(lock);
1128
1129 spin_unlock(&lock->wait_lock);
1130
1131 /*
1132 * Readjust priority, when we did not get the lock. We might have been
1133 * the pending owner and boosted. Since we did not take the lock, the
1134 * PI boost has to go.
1135 */
1136 if (unlikely(ret))
1137 rt_mutex_adjust_prio(current);
1138
1139 return ret;
1140}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
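The rt_mutex_start_proxy_lock()/rt_mutex_finish_proxy_lock() pair declared just above splits an acquisition into a phase performed on behalf of another task (by the futex requeue path) and a phase the woken task completes itself. Kernel code cannot be run on its own, so the sketch below is only a userspace analogue of that two-phase hand-off using POSIX threads; struct proxy_lock, start_proxy_lock() and finish_proxy_lock() are invented names for illustration and are not the kernel API.

/*
 * Userspace analogue of the two-phase "proxy lock" hand-off, for
 * illustration only.  All names here (proxy_lock, start_proxy_lock,
 * finish_proxy_lock) are invented; the real interfaces are the
 * rt_mutex_{start,finish}_proxy_lock() pair added above.  Unlock and
 * contended-waiter paths are omitted.  Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct proxy_lock {
	pthread_mutex_t	wait_lock;	/* plays the role of lock->wait_lock */
	pthread_cond_t	wait;
	pthread_t	owner;
	bool		held;
};

/*
 * Called by the "requeue" thread on behalf of @task.  Returns 1 if the
 * lock was taken for @task (caller should wake it up), 0 if @task would
 * have to block as a waiter and finish the acquisition itself.
 */
static int start_proxy_lock(struct proxy_lock *l, pthread_t task)
{
	int got_it = 0;

	pthread_mutex_lock(&l->wait_lock);
	if (!l->held) {
		l->held = true;
		l->owner = task;
		got_it = 1;
	}
	pthread_cond_broadcast(&l->wait);	/* "wake it up" */
	pthread_mutex_unlock(&l->wait_lock);
	return got_it;
}

/* Called by the task itself once woken: block until it owns the lock. */
static void finish_proxy_lock(struct proxy_lock *l)
{
	pthread_mutex_lock(&l->wait_lock);
	while (!(l->held && pthread_equal(l->owner, pthread_self())))
		pthread_cond_wait(&l->wait, &l->wait_lock);
	pthread_mutex_unlock(&l->wait_lock);
}

static struct proxy_lock demo_lock = {
	.wait_lock	= PTHREAD_MUTEX_INITIALIZER,
	.wait		= PTHREAD_COND_INITIALIZER,
};

static void *waiter(void *unused)
{
	(void)unused;
	finish_proxy_lock(&demo_lock);
	printf("waiter: acquisition completed in its own context\n");
	return NULL;
}

int main(void)
{
	pthread_t task;

	pthread_create(&task, NULL, waiter, NULL);
	/* main acts as the requeue path and takes the lock for the waiter */
	start_proxy_lock(&demo_lock, task);
	pthread_join(task, NULL);
	return 0;
}

The real kernel flow adds waiter enqueueing, priority inheritance and deadlock handling on top of this, but the division of labour is the same one the kernel-doc above describes: the requeue path starts the acquisition for the task, and the requeued task later finishes it in its own context.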
diff --git a/kernel/sched.c b/kernel/sched.c
index 6530a27052f3..8fb88a906aaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,7 +69,6 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
@@ -240,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
240 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
241 delta = ktime_to_ns(ktime_sub(hard, soft)); 241 delta = ktime_to_ns(ktime_sub(hard, soft));
242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
243 HRTIMER_MODE_ABS, 0); 243 HRTIMER_MODE_ABS_PINNED, 0);
244 } 244 }
245 spin_unlock(&rt_b->rt_runtime_lock); 245 spin_unlock(&rt_b->rt_runtime_lock);
246} 246}
@@ -580,6 +580,7 @@ struct rq {
580 struct load_weight load; 580 struct load_weight load;
581 unsigned long nr_load_updates; 581 unsigned long nr_load_updates;
582 u64 nr_switches; 582 u64 nr_switches;
583 u64 nr_migrations_in;
583 584
584 struct cfs_rq cfs; 585 struct cfs_rq cfs;
585 struct rt_rq rt; 586 struct rt_rq rt;
@@ -626,6 +627,10 @@ struct rq {
626 struct list_head migration_queue; 627 struct list_head migration_queue;
627#endif 628#endif
628 629
630 /* calc_load related fields */
631 unsigned long calc_load_update;
632 long calc_load_active;
633
629#ifdef CONFIG_SCHED_HRTICK 634#ifdef CONFIG_SCHED_HRTICK
630#ifdef CONFIG_SMP 635#ifdef CONFIG_SMP
631 int hrtick_csd_pending; 636 int hrtick_csd_pending;
@@ -688,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
688#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
689#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
690 695
691static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
692{ 697{
693 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
694} 699}
@@ -1150,7 +1155,7 @@ static __init void init_hrtick(void)
1150static void hrtick_start(struct rq *rq, u64 delay) 1155static void hrtick_start(struct rq *rq, u64 delay)
1151{ 1156{
1152 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1157 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1153 HRTIMER_MODE_REL, 0); 1158 HRTIMER_MODE_REL_PINNED, 0);
1154} 1159}
1155 1160
1156static inline void init_hrtick(void) 1161static inline void init_hrtick(void)
@@ -1724,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1724} 1729}
1725#endif 1730#endif
1726 1731
1732static void calc_load_account_active(struct rq *this_rq);
1733
1727#include "sched_stats.h" 1734#include "sched_stats.h"
1728#include "sched_idletask.c" 1735#include "sched_idletask.c"
1729#include "sched_fair.c" 1736#include "sched_fair.c"
@@ -1963,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1963 p->se.sleep_start -= clock_offset; 1970 p->se.sleep_start -= clock_offset;
1964 if (p->se.block_start) 1971 if (p->se.block_start)
1965 p->se.block_start -= clock_offset; 1972 p->se.block_start -= clock_offset;
1973#endif
1966 if (old_cpu != new_cpu) { 1974 if (old_cpu != new_cpu) {
1967 schedstat_inc(p, se.nr_migrations); 1975 p->se.nr_migrations++;
1976 new_rq->nr_migrations_in++;
1977#ifdef CONFIG_SCHEDSTATS
1968 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1969 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1970 }
1971#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu);
1982 }
1972 p->se.vruntime -= old_cfsrq->min_vruntime - 1983 p->se.vruntime -= old_cfsrq->min_vruntime -
1973 new_cfsrq->min_vruntime; 1984 new_cfsrq->min_vruntime;
1974 1985
@@ -2181,6 +2192,7 @@ void kick_process(struct task_struct *p)
2181 smp_send_reschedule(cpu); 2192 smp_send_reschedule(cpu);
2182 preempt_enable(); 2193 preempt_enable();
2183} 2194}
2195EXPORT_SYMBOL_GPL(kick_process);
2184 2196
2185/* 2197/*
2186 * Return a low guess at the load of a migration-source cpu weighted 2198 * Return a low guess at the load of a migration-source cpu weighted
@@ -2363,6 +2375,27 @@ static int sched_balance_self(int cpu, int flag)
2363 2375
2364#endif /* CONFIG_SMP */ 2376#endif /* CONFIG_SMP */
2365 2377
2378/**
2379 * task_oncpu_function_call - call a function on the cpu on which a task runs
2380 * @p: the task to evaluate
2381 * @func: the function to be called
2382 * @info: the function call argument
2383 *
2384 * Calls the function @func when the task is currently running. This might
2385 * be on the current CPU, which just calls the function directly
2386 */
2387void task_oncpu_function_call(struct task_struct *p,
2388 void (*func) (void *info), void *info)
2389{
2390 int cpu;
2391
2392 preempt_disable();
2393 cpu = task_cpu(p);
2394 if (task_curr(p))
2395 smp_call_function_single(cpu, func, info, 1);
2396 preempt_enable();
2397}
2398
2366/*** 2399/***
2367 * try_to_wake_up - wake up a thread 2400 * try_to_wake_up - wake up a thread
2368 * @p: the to-be-woken-up thread 2401 * @p: the to-be-woken-up thread
@@ -2497,6 +2530,17 @@ out:
2497 return success; 2530 return success;
2498} 2531}
2499 2532
2533/**
2534 * wake_up_process - Wake up a specific process
2535 * @p: The process to be woken up.
2536 *
2537 * Attempt to wake up the nominated process and move it to the set of runnable
2538 * processes. Returns 1 if the process was woken up, 0 if it was already
2539 * running.
2540 *
2541 * It may be assumed that this function implies a write memory barrier before
2542 * changing the task state if and only if any tasks are woken up.
2543 */
2500int wake_up_process(struct task_struct *p) 2544int wake_up_process(struct task_struct *p)
2501{ 2545{
2502 return try_to_wake_up(p, TASK_ALL, 0); 2546 return try_to_wake_up(p, TASK_ALL, 0);
@@ -2519,6 +2563,7 @@ static void __sched_fork(struct task_struct *p)
2519 p->se.exec_start = 0; 2563 p->se.exec_start = 0;
2520 p->se.sum_exec_runtime = 0; 2564 p->se.sum_exec_runtime = 0;
2521 p->se.prev_sum_exec_runtime = 0; 2565 p->se.prev_sum_exec_runtime = 0;
2566 p->se.nr_migrations = 0;
2522 p->se.last_wakeup = 0; 2567 p->se.last_wakeup = 0;
2523 p->se.avg_overlap = 0; 2568 p->se.avg_overlap = 0;
2524 p->se.start_runtime = 0; 2569 p->se.start_runtime = 0;
@@ -2749,6 +2794,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2749 */ 2794 */
2750 prev_state = prev->state; 2795 prev_state = prev->state;
2751 finish_arch_switch(prev); 2796 finish_arch_switch(prev);
2797 perf_counter_task_sched_in(current, cpu_of(rq));
2752 finish_lock_switch(rq, prev); 2798 finish_lock_switch(rq, prev);
2753#ifdef CONFIG_SMP 2799#ifdef CONFIG_SMP
2754 if (post_schedule) 2800 if (post_schedule)
@@ -2805,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2805 * combine the page table reload and the switch backend into 2851 * combine the page table reload and the switch backend into
2806 * one hypercall. 2852 * one hypercall.
2807 */ 2853 */
2808 arch_enter_lazy_cpu_mode(); 2854 arch_start_context_switch(prev);
2809 2855
2810 if (unlikely(!mm)) { 2856 if (unlikely(!mm)) {
2811 next->active_mm = oldmm; 2857 next->active_mm = oldmm;
@@ -2895,19 +2941,81 @@ unsigned long nr_iowait(void)
2895 return sum; 2941 return sum;
2896} 2942}
2897 2943
2898unsigned long nr_active(void) 2944/* Variables and functions for calc_load */
2945static atomic_long_t calc_load_tasks;
2946static unsigned long calc_load_update;
2947unsigned long avenrun[3];
2948EXPORT_SYMBOL(avenrun);
2949
2950/**
2951 * get_avenrun - get the load average array
2952 * @loads: pointer to dest load array
2953 * @offset: offset to add
2954 * @shift: shift count to shift the result left
2955 *
2956 * These values are estimates at best, so no need for locking.
2957 */
2958void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2899{ 2959{
2900 unsigned long i, running = 0, uninterruptible = 0; 2960 loads[0] = (avenrun[0] + offset) << shift;
2961 loads[1] = (avenrun[1] + offset) << shift;
2962 loads[2] = (avenrun[2] + offset) << shift;
2963}
2901 2964
2902 for_each_online_cpu(i) { 2965static unsigned long
2903 running += cpu_rq(i)->nr_running; 2966calc_load(unsigned long load, unsigned long exp, unsigned long active)
2904 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2967{
2905 } 2968 load *= exp;
2969 load += active * (FIXED_1 - exp);
2970 return load >> FSHIFT;
2971}
2972
2973/*
2974 * calc_load - update the avenrun load estimates 10 ticks after the
2975 * CPUs have updated calc_load_tasks.
2976 */
2977void calc_global_load(void)
2978{
2979 unsigned long upd = calc_load_update + 10;
2980 long active;
2981
2982 if (time_before(jiffies, upd))
2983 return;
2984
2985 active = atomic_long_read(&calc_load_tasks);
2986 active = active > 0 ? active * FIXED_1 : 0;
2906 2987
2907 if (unlikely((long)uninterruptible < 0)) 2988 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2908 uninterruptible = 0; 2989 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2990 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2909 2991
2910 return running + uninterruptible; 2992 calc_load_update += LOAD_FREQ;
2993}
2994
2995/*
2996 * Either called from update_cpu_load() or from a cpu going idle
2997 */
2998static void calc_load_account_active(struct rq *this_rq)
2999{
3000 long nr_active, delta;
3001
3002 nr_active = this_rq->nr_running;
3003 nr_active += (long) this_rq->nr_uninterruptible;
3004
3005 if (nr_active != this_rq->calc_load_active) {
3006 delta = nr_active - this_rq->calc_load_active;
3007 this_rq->calc_load_active = nr_active;
3008 atomic_long_add(delta, &calc_load_tasks);
3009 }
3010}
3011
3012/*
3013 * Externally visible per-cpu scheduler statistics:
3014 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3015 */
3016u64 cpu_nr_migrations(int cpu)
3017{
3018 return cpu_rq(cpu)->nr_migrations_in;
2911} 3019}
2912 3020
2913/* 3021/*
@@ -2938,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
2938 new_load += scale-1; 3046 new_load += scale-1;
2939 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3047 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2940 } 3048 }
3049
3050 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3051 this_rq->calc_load_update += LOAD_FREQ;
3052 calc_load_account_active(this_rq);
3053 }
2941} 3054}
2942 3055
2943#ifdef CONFIG_SMP 3056#ifdef CONFIG_SMP
@@ -4279,10 +4392,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4279static struct { 4392static struct {
4280 atomic_t load_balancer; 4393 atomic_t load_balancer;
4281 cpumask_var_t cpu_mask; 4394 cpumask_var_t cpu_mask;
4395 cpumask_var_t ilb_grp_nohz_mask;
4282} nohz ____cacheline_aligned = { 4396} nohz ____cacheline_aligned = {
4283 .load_balancer = ATOMIC_INIT(-1), 4397 .load_balancer = ATOMIC_INIT(-1),
4284}; 4398};
4285 4399
4400int get_nohz_load_balancer(void)
4401{
4402 return atomic_read(&nohz.load_balancer);
4403}
4404
4405#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4406/**
4407 * lowest_flag_domain - Return lowest sched_domain containing flag.
4408 * @cpu: The cpu whose lowest level of sched domain is to
4409 * be returned.
4410 * @flag: The flag to check for the lowest sched_domain
4411 * for the given cpu.
4412 *
4413 * Returns the lowest sched_domain of a cpu which contains the given flag.
4414 */
4415static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4416{
4417 struct sched_domain *sd;
4418
4419 for_each_domain(cpu, sd)
4420 if (sd && (sd->flags & flag))
4421 break;
4422
4423 return sd;
4424}
4425
4426/**
4427 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4428 * @cpu: The cpu whose domains we're iterating over.
4429 * @sd: variable holding the value of the power_savings_sd
4430 * for cpu.
4431 * @flag: The flag to filter the sched_domains to be iterated.
4432 *
4433 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4434 * set, starting from the lowest sched_domain to the highest.
4435 */
4436#define for_each_flag_domain(cpu, sd, flag) \
4437 for (sd = lowest_flag_domain(cpu, flag); \
4438 (sd && (sd->flags & flag)); sd = sd->parent)
4439
4440/**
4441 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4442 * @ilb_group: group to be checked for semi-idleness
4443 *
4444 * Returns: 1 if the group is semi-idle. 0 otherwise.
4445 *
4446 * We define a sched_group to be semi-idle if it has at least one idle-CPU
4447 * and at least one non-idle CPU. This helper function checks if the given
4448 * sched_group is semi-idle or not.
4449 */
4450static inline int is_semi_idle_group(struct sched_group *ilb_group)
4451{
4452 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4453 sched_group_cpus(ilb_group));
4454
4455 /*
4456 * A sched_group is semi-idle when it has at least one busy cpu
4457 * and at least one idle cpu.
4458 */
4459 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4460 return 0;
4461
4462 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4463 return 0;
4464
4465 return 1;
4466}
4467/**
4468 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4469 * @cpu: The cpu which is nominating a new idle_load_balancer.
4470 *
4471 * Returns: Returns the id of the idle load balancer if it exists,
4472 * Else, returns >= nr_cpu_ids.
4473 *
4474 * This algorithm picks the idle load balancer such that it belongs to a
4475 * semi-idle powersavings sched_domain. The idea is to try and avoid
4476 * completely idle packages/cores just for the purpose of idle load balancing
4477 * when there are other idle CPUs which are better suited for that job.
4478 */
4479static int find_new_ilb(int cpu)
4480{
4481 struct sched_domain *sd;
4482 struct sched_group *ilb_group;
4483
4484 /*
4485 * Have idle load balancer selection from semi-idle packages only
4486 * when power-aware load balancing is enabled
4487 */
4488 if (!(sched_smt_power_savings || sched_mc_power_savings))
4489 goto out_done;
4490
4491 /*
4492 * Optimize for the case when we have no idle CPUs or only one
4493 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4494 */
4495 if (cpumask_weight(nohz.cpu_mask) < 2)
4496 goto out_done;
4497
4498 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4499 ilb_group = sd->groups;
4500
4501 do {
4502 if (is_semi_idle_group(ilb_group))
4503 return cpumask_first(nohz.ilb_grp_nohz_mask);
4504
4505 ilb_group = ilb_group->next;
4506
4507 } while (ilb_group != sd->groups);
4508 }
4509
4510out_done:
4511 return cpumask_first(nohz.cpu_mask);
4512}
4513#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4514static inline int find_new_ilb(int call_cpu)
4515{
4516 return cpumask_first(nohz.cpu_mask);
4517}
4518#endif
4519
4286/* 4520/*
4287 * This routine will try to nominate the ilb (idle load balancing) 4521 * This routine will try to nominate the ilb (idle load balancing)
4288 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4522 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
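is_semi_idle_group() above calls a sched_group semi-idle when the intersection of its CPUs with the tick-stopped (nohz) mask is neither empty nor the whole group, and find_new_ilb() then nominates the first idle CPU of the first such group, falling back to the first idle CPU overall. The toy below reproduces only that set logic with plain bitmasks; it is not the kernel cpumask/sched_domain API, and the two 4-CPU packages are an assumption made up for the demo.

/* Toy model of the semi-idle group selection rule, using plain bitmasks. */
#include <stdio.h>

static int first_cpu(unsigned long mask)
{
	return mask ? __builtin_ctzl(mask) : -1;
}

static int pick_idle_load_balancer(unsigned long nohz_mask,
				   const unsigned long *groups, int nr_groups)
{
	int g;

	for (g = 0; g < nr_groups; g++) {
		unsigned long idle_in_grp = nohz_mask & groups[g];

		/* semi-idle: some, but not all, CPUs of the group are idle */
		if (idle_in_grp && idle_in_grp != groups[g])
			return first_cpu(idle_in_grp);
	}
	return first_cpu(nohz_mask);		/* fallback: first idle CPU */
}

int main(void)
{
	unsigned long groups[] = { 0x0fUL, 0xf0UL };	/* CPUs 0-3, 4-7 */
	unsigned long nohz_mask = 0x0cUL | 0xf0UL;	/* 2,3 idle; 4-7 idle */

	printf("new ilb owner: cpu %d\n",
	       pick_idle_load_balancer(nohz_mask, groups, 2));	/* cpu 2 */
	return 0;
}

Preferring a semi-idle group keeps a completely idle package asleep instead of waking it just to run the idle load balancer, which is the power-savings rationale the find_new_ilb() comment spells out.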
@@ -4337,8 +4571,24 @@ int select_nohz_load_balancer(int stop_tick)
4337 /* make me the ilb owner */ 4571 /* make me the ilb owner */
4338 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4572 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4339 return 1; 4573 return 1;
4340 } else if (atomic_read(&nohz.load_balancer) == cpu) 4574 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4575 int new_ilb;
4576
4577 if (!(sched_smt_power_savings ||
4578 sched_mc_power_savings))
4579 return 1;
4580 /*
4581 * Check to see if there is a more power-efficient
4582 * ilb.
4583 */
4584 new_ilb = find_new_ilb(cpu);
4585 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4586 atomic_set(&nohz.load_balancer, -1);
4587 resched_cpu(new_ilb);
4588 return 0;
4589 }
4341 return 1; 4590 return 1;
4591 }
4342 } else { 4592 } else {
4343 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4593 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4344 return 0; 4594 return 0;
@@ -4507,15 +4757,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4507 } 4757 }
4508 4758
4509 if (atomic_read(&nohz.load_balancer) == -1) { 4759 if (atomic_read(&nohz.load_balancer) == -1) {
4510 /* 4760 int ilb = find_new_ilb(cpu);
4511 * simple selection for now: Nominate the
4512 * first cpu in the nohz list to be the next
4513 * ilb owner.
4514 *
4515 * TBD: Traverse the sched domains and nominate
4516 * the nearest cpu in the nohz.cpu_mask.
4517 */
4518 int ilb = cpumask_first(nohz.cpu_mask);
4519 4761
4520 if (ilb < nr_cpu_ids) 4762 if (ilb < nr_cpu_ids)
4521 resched_cpu(ilb); 4763 resched_cpu(ilb);
@@ -4879,6 +5121,8 @@ void scheduler_tick(void)
4879 curr->sched_class->task_tick(rq, curr, 0); 5121 curr->sched_class->task_tick(rq, curr, 0);
4880 spin_unlock(&rq->lock); 5122 spin_unlock(&rq->lock);
4881 5123
5124 perf_counter_task_tick(curr, cpu);
5125
4882#ifdef CONFIG_SMP 5126#ifdef CONFIG_SMP
4883 rq->idle_at_tick = idle_cpu(cpu); 5127 rq->idle_at_tick = idle_cpu(cpu);
4884 trigger_load_balance(rq, cpu); 5128 trigger_load_balance(rq, cpu);
@@ -5046,13 +5290,15 @@ pick_next_task(struct rq *rq)
5046/* 5290/*
5047 * schedule() is the main scheduler function. 5291 * schedule() is the main scheduler function.
5048 */ 5292 */
5049asmlinkage void __sched __schedule(void) 5293asmlinkage void __sched schedule(void)
5050{ 5294{
5051 struct task_struct *prev, *next; 5295 struct task_struct *prev, *next;
5052 unsigned long *switch_count; 5296 unsigned long *switch_count;
5053 struct rq *rq; 5297 struct rq *rq;
5054 int cpu; 5298 int cpu;
5055 5299
5300need_resched:
5301 preempt_disable();
5056 cpu = smp_processor_id(); 5302 cpu = smp_processor_id();
5057 rq = cpu_rq(cpu); 5303 rq = cpu_rq(cpu);
5058 rcu_qsctr_inc(cpu); 5304 rcu_qsctr_inc(cpu);
@@ -5092,6 +5338,7 @@ need_resched_nonpreemptible:
5092 5338
5093 if (likely(prev != next)) { 5339 if (likely(prev != next)) {
5094 sched_info_switch(prev, next); 5340 sched_info_switch(prev, next);
5341 perf_counter_task_sched_out(prev, next, cpu);
5095 5342
5096 rq->nr_switches++; 5343 rq->nr_switches++;
5097 rq->curr = next; 5344 rq->curr = next;
@@ -5109,15 +5356,9 @@ need_resched_nonpreemptible:
5109 5356
5110 if (unlikely(reacquire_kernel_lock(current) < 0)) 5357 if (unlikely(reacquire_kernel_lock(current) < 0))
5111 goto need_resched_nonpreemptible; 5358 goto need_resched_nonpreemptible;
5112}
5113 5359
5114asmlinkage void __sched schedule(void)
5115{
5116need_resched:
5117 preempt_disable();
5118 __schedule();
5119 preempt_enable_no_resched(); 5360 preempt_enable_no_resched();
5120 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5361 if (need_resched())
5121 goto need_resched; 5362 goto need_resched;
5122} 5363}
5123EXPORT_SYMBOL(schedule); 5364EXPORT_SYMBOL(schedule);
@@ -5260,7 +5501,7 @@ EXPORT_SYMBOL(default_wake_function);
5260 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5501 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5261 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5502 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5262 */ 5503 */
5263void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5504static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5264 int nr_exclusive, int sync, void *key) 5505 int nr_exclusive, int sync, void *key)
5265{ 5506{
5266 wait_queue_t *curr, *next; 5507 wait_queue_t *curr, *next;
@@ -5280,6 +5521,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5280 * @mode: which threads 5521 * @mode: which threads
5281 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5522 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5282 * @key: is directly passed to the wakeup function 5523 * @key: is directly passed to the wakeup function
5524 *
5525 * It may be assumed that this function implies a write memory barrier before
5526 * changing the task state if and only if any tasks are woken up.
5283 */ 5527 */
5284void __wake_up(wait_queue_head_t *q, unsigned int mode, 5528void __wake_up(wait_queue_head_t *q, unsigned int mode,
5285 int nr_exclusive, void *key) 5529 int nr_exclusive, void *key)
@@ -5318,6 +5562,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5318 * with each other. This can prevent needless bouncing between CPUs. 5562 * with each other. This can prevent needless bouncing between CPUs.
5319 * 5563 *
5320 * On UP it can prevent extra preemption. 5564 * On UP it can prevent extra preemption.
5565 *
5566 * It may be assumed that this function implies a write memory barrier before
5567 * changing the task state if and only if any tasks are woken up.
5321 */ 5568 */
5322void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5569void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5323 int nr_exclusive, void *key) 5570 int nr_exclusive, void *key)
@@ -5354,6 +5601,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5354 * awakened in the same order in which they were queued. 5601 * awakened in the same order in which they were queued.
5355 * 5602 *
5356 * See also complete_all(), wait_for_completion() and related routines. 5603 * See also complete_all(), wait_for_completion() and related routines.
5604 *
5605 * It may be assumed that this function implies a write memory barrier before
5606 * changing the task state if and only if any tasks are woken up.
5357 */ 5607 */
5358void complete(struct completion *x) 5608void complete(struct completion *x)
5359{ 5609{
@@ -5371,6 +5621,9 @@ EXPORT_SYMBOL(complete);
5371 * @x: holds the state of this particular completion 5621 * @x: holds the state of this particular completion
5372 * 5622 *
5373 * This will wake up all threads waiting on this particular completion event. 5623 * This will wake up all threads waiting on this particular completion event.
5624 *
5625 * It may be assumed that this function implies a write memory barrier before
5626 * changing the task state if and only if any tasks are woken up.
5374 */ 5627 */
5375void complete_all(struct completion *x) 5628void complete_all(struct completion *x)
5376{ 5629{
@@ -6529,8 +6782,9 @@ void sched_show_task(struct task_struct *p)
6529#ifdef CONFIG_DEBUG_STACK_USAGE 6782#ifdef CONFIG_DEBUG_STACK_USAGE
6530 free = stack_not_used(p); 6783 free = stack_not_used(p);
6531#endif 6784#endif
6532 printk(KERN_CONT "%5lu %5d %6d\n", free, 6785 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6533 task_pid_nr(p), task_pid_nr(p->real_parent)); 6786 task_pid_nr(p), task_pid_nr(p->real_parent),
6787 (unsigned long)task_thread_info(p)->flags);
6534 6788
6535 show_stack(p, NULL); 6789 show_stack(p, NULL);
6536} 6790}
@@ -7009,6 +7263,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7009 7263
7010 } 7264 }
7011} 7265}
7266
7267/*
7268 * remove the tasks which were accounted by rq from calc_load_tasks.
7269 */
7270static void calc_global_load_remove(struct rq *rq)
7271{
7272 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7273}
7012#endif /* CONFIG_HOTPLUG_CPU */ 7274#endif /* CONFIG_HOTPLUG_CPU */
7013 7275
7014#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7276#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7243,6 +7505,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 /* Update our root-domain */ 7505 /* Update our root-domain */
7244 rq = cpu_rq(cpu); 7506 rq = cpu_rq(cpu);
7245 spin_lock_irqsave(&rq->lock, flags); 7507 spin_lock_irqsave(&rq->lock, flags);
7508 rq->calc_load_update = calc_load_update;
7509 rq->calc_load_active = 0;
7246 if (rq->rd) { 7510 if (rq->rd) {
7247 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7511 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7248 7512
@@ -7282,7 +7546,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7282 cpuset_unlock(); 7546 cpuset_unlock();
7283 migrate_nr_uninterruptible(rq); 7547 migrate_nr_uninterruptible(rq);
7284 BUG_ON(rq->nr_running != 0); 7548 BUG_ON(rq->nr_running != 0);
7285 7549 calc_global_load_remove(rq);
7286 /* 7550 /*
7287 * No need to migrate the tasks: it was best-effort if 7551 * No need to migrate the tasks: it was best-effort if
7288 * they didn't take sched_hotcpu_mutex. Just wake up 7552 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7318,8 +7582,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7318 return NOTIFY_OK; 7582 return NOTIFY_OK;
7319} 7583}
7320 7584
7321/* Register at highest priority so that task migration (migrate_all_tasks) 7585/*
7322 * happens before everything else. 7586 * Register at high priority so that task migration (migrate_all_tasks)
7587 * happens before everything else. This has to be lower priority than
7588 * the notifier in the perf_counter subsystem, though.
7323 */ 7589 */
7324static struct notifier_block __cpuinitdata migration_notifier = { 7590static struct notifier_block __cpuinitdata migration_notifier = {
7325 .notifier_call = migration_call, 7591 .notifier_call = migration_call,
@@ -7564,24 +7830,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7564 7830
7565static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7831static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
7566{ 7832{
7833 gfp_t gfp = GFP_KERNEL;
7834
7567 memset(rd, 0, sizeof(*rd)); 7835 memset(rd, 0, sizeof(*rd));
7568 7836
7569 if (bootmem) { 7837 if (bootmem)
7570 alloc_bootmem_cpumask_var(&def_root_domain.span); 7838 gfp = GFP_NOWAIT;
7571 alloc_bootmem_cpumask_var(&def_root_domain.online);
7572 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7573 cpupri_init(&rd->cpupri, true);
7574 return 0;
7575 }
7576 7839
7577 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7840 if (!alloc_cpumask_var(&rd->span, gfp))
7578 goto out; 7841 goto out;
7579 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7842 if (!alloc_cpumask_var(&rd->online, gfp))
7580 goto free_span; 7843 goto free_span;
7581 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7844 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7582 goto free_online; 7845 goto free_online;
7583 7846
7584 if (cpupri_init(&rd->cpupri, false) != 0) 7847 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7585 goto free_rto_mask; 7848 goto free_rto_mask;
7586 return 0; 7849 return 0;
7587 7850
@@ -7792,8 +8055,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7792 8055
7793/* 8056/*
7794 * The cpus mask in sched_group and sched_domain hangs off the end. 8057 * The cpus mask in sched_group and sched_domain hangs off the end.
7795 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 8058 *
7796 * for nr_cpu_ids < CONFIG_NR_CPUS. 8059 * ( See the the comments in include/linux/sched.h:struct sched_group
8060 * and struct sched_domain. )
7797 */ 8061 */
7798struct static_sched_group { 8062struct static_sched_group {
7799 struct sched_group sg; 8063 struct sched_group sg;
@@ -7914,7 +8178,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7914 struct sched_domain *sd; 8178 struct sched_domain *sd;
7915 8179
7916 sd = &per_cpu(phys_domains, j).sd; 8180 sd = &per_cpu(phys_domains, j).sd;
7917 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8181 if (j != group_first_cpu(sd->groups)) {
7918 /* 8182 /*
7919 * Only add "power" once for each 8183 * Only add "power" once for each
7920 * physical package. 8184 * physical package.
@@ -7992,7 +8256,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7992 8256
7993 WARN_ON(!sd || !sd->groups); 8257 WARN_ON(!sd || !sd->groups);
7994 8258
7995 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8259 if (cpu != group_first_cpu(sd->groups))
7996 return; 8260 return;
7997 8261
7998 child = sd->child; 8262 child = sd->child;
@@ -8770,6 +9034,8 @@ void __init sched_init_smp(void)
8770} 9034}
8771#endif /* CONFIG_SMP */ 9035#endif /* CONFIG_SMP */
8772 9036
9037const_debug unsigned int sysctl_timer_migration = 1;
9038
8773int in_sched_functions(unsigned long addr) 9039int in_sched_functions(unsigned long addr)
8774{ 9040{
8775 return in_lock_functions(addr) || 9041 return in_lock_functions(addr) ||
@@ -8904,7 +9170,7 @@ void __init sched_init(void)
8904 * we use alloc_bootmem(). 9170 * we use alloc_bootmem().
8905 */ 9171 */
8906 if (alloc_size) { 9172 if (alloc_size) {
8907 ptr = (unsigned long)alloc_bootmem(alloc_size); 9173 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8908 9174
8909#ifdef CONFIG_FAIR_GROUP_SCHED 9175#ifdef CONFIG_FAIR_GROUP_SCHED
8910 init_task_group.se = (struct sched_entity **)ptr; 9176 init_task_group.se = (struct sched_entity **)ptr;
@@ -8977,6 +9243,8 @@ void __init sched_init(void)
8977 rq = cpu_rq(i); 9243 rq = cpu_rq(i);
8978 spin_lock_init(&rq->lock); 9244 spin_lock_init(&rq->lock);
8979 rq->nr_running = 0; 9245 rq->nr_running = 0;
9246 rq->calc_load_active = 0;
9247 rq->calc_load_update = jiffies + LOAD_FREQ;
8980 init_cfs_rq(&rq->cfs, rq); 9248 init_cfs_rq(&rq->cfs, rq);
8981 init_rt_rq(&rq->rt, rq); 9249 init_rt_rq(&rq->rt, rq);
8982#ifdef CONFIG_FAIR_GROUP_SCHED 9250#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8997,7 +9265,7 @@ void __init sched_init(void)
8997 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9265 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8998 * then A0's share of the cpu resource is: 9266 * then A0's share of the cpu resource is:
8999 * 9267 *
9000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9268 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9001 * 9269 *
9002 * We achieve this by letting init_task_group's tasks sit 9270 * We achieve this by letting init_task_group's tasks sit
9003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9271 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9084,20 +9352,26 @@ void __init sched_init(void)
9084 * when this runqueue becomes "idle". 9352 * when this runqueue becomes "idle".
9085 */ 9353 */
9086 init_idle(current, smp_processor_id()); 9354 init_idle(current, smp_processor_id());
9355
9356 calc_load_update = jiffies + LOAD_FREQ;
9357
9087 /* 9358 /*
9088 * During early bootup we pretend to be a normal task: 9359 * During early bootup we pretend to be a normal task:
9089 */ 9360 */
9090 current->sched_class = &fair_sched_class; 9361 current->sched_class = &fair_sched_class;
9091 9362
9092 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9363 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9093 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9364 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9094#ifdef CONFIG_SMP 9365#ifdef CONFIG_SMP
9095#ifdef CONFIG_NO_HZ 9366#ifdef CONFIG_NO_HZ
9096 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9367 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9368 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9097#endif 9369#endif
9098 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9370 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9099#endif /* SMP */ 9371#endif /* SMP */
9100 9372
9373 perf_counter_init();
9374
9101 scheduler_running = 1; 9375 scheduler_running = 1;
9102} 9376}
9103 9377
@@ -9839,6 +10113,13 @@ static int sched_rt_global_constraints(void)
9839 if (sysctl_sched_rt_period <= 0) 10113 if (sysctl_sched_rt_period <= 0)
9840 return -EINVAL; 10114 return -EINVAL;
9841 10115
10116 /*
10117 * There's always some RT tasks in the root group
10118 * -- migration, kstopmachine etc..
10119 */
10120 if (sysctl_sched_rt_runtime == 0)
10121 return -EBUSY;
10122
9842 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10123 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9843 for_each_possible_cpu(i) { 10124 for_each_possible_cpu(i) {
9844 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10125 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 819f17ac796e..e1d16c9a7680 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -38,7 +38,8 @@
38 */ 38 */
39unsigned long long __attribute__((weak)) sched_clock(void) 39unsigned long long __attribute__((weak)) sched_clock(void)
40{ 40{
41 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ);
42} 43}
43 44
44static __read_mostly int sched_clock_running; 45static __read_mostly int sched_clock_running;
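The weak sched_clock() fallback now reports time relative to INITIAL_JIFFIES, so the clock starts near zero rather than at whatever large value jiffies is initialized to (the kernel deliberately starts jiffies close to its wrap point). A quick arithmetic check is below; HZ=1000 and the INITIAL_JIFFIES definition are assumptions taken from common configurations and include/linux/jiffies.h as remembered, not from this patch.

/*
 * Sanity-check the weak sched_clock() fallback arithmetic.  HZ and
 * INITIAL_JIFFIES are assumed values (INITIAL_JIFFIES is normally
 * chosen so that jiffies wraps about five minutes after boot).
 */
#include <stdio.h>

#define HZ		1000
#define NSEC_PER_SEC	1000000000UL
#define INITIAL_JIFFIES	((unsigned long)(unsigned int)(-300 * HZ))

static unsigned long long sched_clock(unsigned long jiffies)
{
	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
		* (NSEC_PER_SEC / HZ);
}

int main(void)
{
	/* 2500 ticks (2.5 s at HZ=1000) after the jiffies counter started */
	unsigned long jiffies = INITIAL_JIFFIES + 2500;

	printf("%llu ns since boot\n", sched_clock(jiffies));
	return 0;
}

This prints 2500000000 ns; with the old formula the very first samples would instead have been huge values derived from the raw jiffies counter, which is exactly what the subtraction avoids.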
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89574cd..7deffc9f0e5f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL;
157 int i; 158 int i;
158 159
160 if (bootmem)
161 gfp = GFP_NOWAIT;
162
159 memset(cp, 0, sizeof(*cp)); 163 memset(cp, 0, sizeof(*cp));
160 164
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 165 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 167
164 spin_lock_init(&vec->lock); 168 spin_lock_init(&vec->lock);
165 vec->count = 0; 169 vec->count = 0;
166 if (bootmem) 170 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 171 goto cleanup;
170 } 172 }
171 173
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1487
1488 find_matching_se(&se, &pse); 1488 find_matching_se(&se, &pse);
1489 1489
1490 while (se) { 1490 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492 1491
1493 if (wakeup_preempt_entity(se, pse) == 1) { 1492 if (wakeup_preempt_entity(se, pse) == 1)
1494 resched_task(curr); 1493 resched_task(curr);
1495 break;
1496 }
1497
1498 se = parent_entity(se);
1499 pse = parent_entity(pse);
1500 }
1501} 1494}
1502 1495
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1496static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f2c66f8f9712..9bf0d2a73045 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)
1591 unsigned int i; 1591 unsigned int i;
1592 1592
1593 for_each_possible_cpu(i) 1593 for_each_possible_cpu(i)
1594 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1594 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1595 GFP_KERNEL, cpu_to_node(i)); 1595 GFP_KERNEL, cpu_to_node(i));
1596} 1596}
1597#endif /* CONFIG_SMP */ 1597#endif /* CONFIG_SMP */
diff --git a/kernel/signal.c b/kernel/signal.c
index 94ec0a4dde0f..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -247,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
247/* 247/*
248 * Flush all pending signals for a task. 248 * Flush all pending signals for a task.
249 */ 249 */
250void __flush_signals(struct task_struct *t)
251{
252 clear_tsk_thread_flag(t, TIF_SIGPENDING);
253 flush_sigqueue(&t->pending);
254 flush_sigqueue(&t->signal->shared_pending);
255}
256
250void flush_signals(struct task_struct *t) 257void flush_signals(struct task_struct *t)
251{ 258{
252 unsigned long flags; 259 unsigned long flags;
253 260
254 spin_lock_irqsave(&t->sighand->siglock, flags); 261 spin_lock_irqsave(&t->sighand->siglock, flags);
255 clear_tsk_thread_flag(t, TIF_SIGPENDING); 262 __flush_signals(t);
256 flush_sigqueue(&t->pending);
257 flush_sigqueue(&t->signal->shared_pending);
258 spin_unlock_irqrestore(&t->sighand->siglock, flags); 263 spin_unlock_irqrestore(&t->sighand->siglock, flags);
259} 264}
260 265
@@ -827,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
827{ 832{
828 struct sigpending *pending; 833 struct sigpending *pending;
829 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
830 836
831 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
832 838
@@ -858,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
858 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
859 pass on the info struct. */ 865 pass on the info struct. */
860 866
861 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
862 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
863 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
864 if (q) { 874 if (q) {
865 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
866 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
@@ -2276,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2276 return kill_something_info(sig, &info, pid); 2286 return kill_something_info(sig, &info, pid);
2277} 2287}
2278 2288
2279static int do_tkill(pid_t tgid, pid_t pid, int sig) 2289static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2280{ 2291{
2281 int error;
2282 struct siginfo info;
2283 struct task_struct *p; 2292 struct task_struct *p;
2284 unsigned long flags; 2293 unsigned long flags;
2285 2294 int error = -ESRCH;
2286 error = -ESRCH;
2287 info.si_signo = sig;
2288 info.si_errno = 0;
2289 info.si_code = SI_TKILL;
2290 info.si_pid = task_tgid_vnr(current);
2291 info.si_uid = current_uid();
2292 2295
2293 rcu_read_lock(); 2296 rcu_read_lock();
2294 p = find_task_by_vpid(pid); 2297 p = find_task_by_vpid(pid);
2295 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2298 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2296 error = check_kill_permission(sig, &info, p); 2299 error = check_kill_permission(sig, info, p);
2297 /* 2300 /*
2298 * The null signal is a permissions and process existence 2301 * The null signal is a permissions and process existence
2299 * probe. No signal is actually delivered. 2302 * probe. No signal is actually delivered.
@@ -2303,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2303 * signal is private anyway. 2306 * signal is private anyway.
2304 */ 2307 */
2305 if (!error && sig && lock_task_sighand(p, &flags)) { 2308 if (!error && sig && lock_task_sighand(p, &flags)) {
2306 error = specific_send_sig_info(sig, &info, p); 2309 error = specific_send_sig_info(sig, info, p);
2307 unlock_task_sighand(p, &flags); 2310 unlock_task_sighand(p, &flags);
2308 } 2311 }
2309 } 2312 }
@@ -2312,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2312 return error; 2315 return error;
2313} 2316}
2314 2317
2318static int do_tkill(pid_t tgid, pid_t pid, int sig)
2319{
2320 struct siginfo info;
2321
2322 info.si_signo = sig;
2323 info.si_errno = 0;
2324 info.si_code = SI_TKILL;
2325 info.si_pid = task_tgid_vnr(current);
2326 info.si_uid = current_uid();
2327
2328 return do_send_specific(tgid, pid, sig, &info);
2329}
2330
2315/** 2331/**
2316 * sys_tgkill - send signal to one specific thread 2332 * sys_tgkill - send signal to one specific thread
2317 * @tgid: the thread group ID of the thread 2333 * @tgid: the thread group ID of the thread
@@ -2361,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2361 return kill_proc_info(sig, &info, pid); 2377 return kill_proc_info(sig, &info, pid);
2362} 2378}
2363 2379
2380long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2381{
2382 /* This is only valid for single tasks */
2383 if (pid <= 0 || tgid <= 0)
2384 return -EINVAL;
2385
2386 /* Not even root can pretend to send signals from the kernel.
2387 Nor can they impersonate a kill(), which adds source info. */
2388 if (info->si_code >= 0)
2389 return -EPERM;
2390 info->si_signo = sig;
2391
2392 return do_send_specific(tgid, pid, sig, info);
2393}
2394
2395SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2396 siginfo_t __user *, uinfo)
2397{
2398 siginfo_t info;
2399
2400 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2401 return -EFAULT;
2402
2403 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2404}
2405
2364int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2406int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2365{ 2407{
2366 struct task_struct *t = current; 2408 struct task_struct *t = current;
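The signal.c changes split the tkill() plumbing into do_send_specific() and add the rt_tgsigqueueinfo() syscall, so a siginfo payload can be queued to one specific thread of a thread group rather than to the group as a whole; the si_code >= 0 check keeps userspace from forging kernel-generated siginfo. A rough userspace sketch of calling the new syscall directly (glibc provides no dedicated wrapper, so the raw syscall number from the installed headers is used; error handling kept minimal):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	sigset_t set;
	siginfo_t info, got;
	pid_t tgid = getpid();
	pid_t tid = syscall(SYS_gettid);	/* send to ourselves for the demo */

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_code = SI_QUEUE;	/* must be < 0, or the kernel returns -EPERM */
	info.si_pid = tgid;
	info.si_uid = getuid();
	info.si_value.sival_int = 42;	/* arbitrary payload */

	if (syscall(__NR_rt_tgsigqueueinfo, tgid, tid, SIGUSR1, &info) < 0) {
		perror("rt_tgsigqueueinfo");
		return 1;
	}

	sigwaitinfo(&set, &got);
	printf("payload: %d\n", got.si_value.sival_int);
	return 0;
}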
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -372,8 +380,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 380 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 381 vsmax /= 100;
374 382
375 prepare_to_wait(&slow_work_thread_wq, &wait, 383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 384 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 385 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 386 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 387 !slow_work_available(vsmax) &&
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
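Wrapping the cull timeout in round_jiffies() trades a little precision for fewer wakeups: imprecise housekeeping timers get rounded to a whole-second boundary so they tend to expire in one batch. A small in-kernel sketch of the same pattern (housekeeping_timer and friends are hypothetical; old-style timer callback signature):

#include <linux/jiffies.h>
#include <linux/timer.h>

static void housekeeping_fn(unsigned long data)
{
	/* periodic cleanup goes here */
}

static DEFINE_TIMER(housekeeping_timer, housekeeping_fn, 0, 0);

static void housekeeping_arm(unsigned long delay)
{
	/*
	 * round_jiffies() rounds the expiry up to a full second, so this
	 * timer and other imprecise timers fire together instead of waking
	 * an otherwise idle CPU several times -- the same idea as
	 * slow_work_schedule_cull() above.
	 */
	mod_timer(&housekeeping_timer, round_jiffies(jiffies + delay));
}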
diff --git a/kernel/smp.c b/kernel/smp.c
index 858baac568ee..ad63d8501207 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 switch (action) { 52 switch (action) {
53 case CPU_UP_PREPARE: 53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN: 54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 55 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu))) 56 cpu_to_node(cpu)))
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index dc4d0cfdcb2d..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 382
383EXPORT_SYMBOL(__tasklet_hi_schedule); 383EXPORT_SYMBOL(__tasklet_hi_schedule);
384 384
385void __tasklet_hi_schedule_first(struct tasklet_struct *t)
386{
387 BUG_ON(!irqs_disabled());
388
389 t->next = __get_cpu_var(tasklet_hi_vec).head;
390 __get_cpu_var(tasklet_hi_vec).head = t;
391 __raise_softirq_irqoff(HI_SOFTIRQ);
392}
393
394EXPORT_SYMBOL(__tasklet_hi_schedule_first);
395
385static void tasklet_action(struct softirq_action *a) 396static void tasklet_action(struct softirq_action *a)
386{ 397{
387 struct tasklet_struct *list; 398 struct tasklet_struct *list;
@@ -827,7 +838,7 @@ int __init __weak arch_early_irq_init(void)
827 return 0; 838 return 0;
828} 839}
829 840
830int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 841int __weak arch_init_chip_data(struct irq_desc *desc, int node)
831{ 842{
832 return 0; 843 return 0;
833} 844}
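The new __tasklet_hi_schedule_first() queues a tasklet at the head of the high-priority vector instead of the tail, for callers that are already running with interrupts disabled (hence the BUG_ON). A hedged sketch of how a hard-irq path might use it, mirroring the usual SCHED-bit test done by the regular scheduling helpers (my_bh and kick_bottom_half are hypothetical):

#include <linux/interrupt.h>

static void my_bh_fn(unsigned long data)	/* hypothetical bottom half */
{
	/* deferred work runs here, still in softirq context */
}

static DECLARE_TASKLET(my_bh, my_bh_fn, 0);

/* Called from a hard interrupt handler, i.e. with IRQs already disabled. */
static void kick_bottom_half(void)
{
	if (!test_and_set_bit(TASKLET_STATE_SCHED, &my_bh.state))
		__tasklet_hi_schedule_first(&my_bh);
}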
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1112,289 +1113,6 @@ out:
1112 return err; 1113 return err;
1113} 1114}
1114 1115
1115/*
1116 * Supplementary group IDs
1117 */
1118
1119/* init to 2 - one for init_task, one to ensure it is never freed */
1120struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1121
1122struct group_info *groups_alloc(int gidsetsize)
1123{
1124 struct group_info *group_info;
1125 int nblocks;
1126 int i;
1127
1128 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1129 /* Make sure we always allocate at least one indirect block pointer */
1130 nblocks = nblocks ? : 1;
1131 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1132 if (!group_info)
1133 return NULL;
1134 group_info->ngroups = gidsetsize;
1135 group_info->nblocks = nblocks;
1136 atomic_set(&group_info->usage, 1);
1137
1138 if (gidsetsize <= NGROUPS_SMALL)
1139 group_info->blocks[0] = group_info->small_block;
1140 else {
1141 for (i = 0; i < nblocks; i++) {
1142 gid_t *b;
1143 b = (void *)__get_free_page(GFP_USER);
1144 if (!b)
1145 goto out_undo_partial_alloc;
1146 group_info->blocks[i] = b;
1147 }
1148 }
1149 return group_info;
1150
1151out_undo_partial_alloc:
1152 while (--i >= 0) {
1153 free_page((unsigned long)group_info->blocks[i]);
1154 }
1155 kfree(group_info);
1156 return NULL;
1157}
1158
1159EXPORT_SYMBOL(groups_alloc);
1160
1161void groups_free(struct group_info *group_info)
1162{
1163 if (group_info->blocks[0] != group_info->small_block) {
1164 int i;
1165 for (i = 0; i < group_info->nblocks; i++)
1166 free_page((unsigned long)group_info->blocks[i]);
1167 }
1168 kfree(group_info);
1169}
1170
1171EXPORT_SYMBOL(groups_free);
1172
1173/* export the group_info to a user-space array */
1174static int groups_to_user(gid_t __user *grouplist,
1175 const struct group_info *group_info)
1176{
1177 int i;
1178 unsigned int count = group_info->ngroups;
1179
1180 for (i = 0; i < group_info->nblocks; i++) {
1181 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1182 unsigned int len = cp_count * sizeof(*grouplist);
1183
1184 if (copy_to_user(grouplist, group_info->blocks[i], len))
1185 return -EFAULT;
1186
1187 grouplist += NGROUPS_PER_BLOCK;
1188 count -= cp_count;
1189 }
1190 return 0;
1191}
1192
1193/* fill a group_info from a user-space array - it must be allocated already */
1194static int groups_from_user(struct group_info *group_info,
1195 gid_t __user *grouplist)
1196{
1197 int i;
1198 unsigned int count = group_info->ngroups;
1199
1200 for (i = 0; i < group_info->nblocks; i++) {
1201 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1202 unsigned int len = cp_count * sizeof(*grouplist);
1203
1204 if (copy_from_user(group_info->blocks[i], grouplist, len))
1205 return -EFAULT;
1206
1207 grouplist += NGROUPS_PER_BLOCK;
1208 count -= cp_count;
1209 }
1210 return 0;
1211}
1212
1213/* a simple Shell sort */
1214static void groups_sort(struct group_info *group_info)
1215{
1216 int base, max, stride;
1217 int gidsetsize = group_info->ngroups;
1218
1219 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1220 ; /* nothing */
1221 stride /= 3;
1222
1223 while (stride) {
1224 max = gidsetsize - stride;
1225 for (base = 0; base < max; base++) {
1226 int left = base;
1227 int right = left + stride;
1228 gid_t tmp = GROUP_AT(group_info, right);
1229
1230 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1231 GROUP_AT(group_info, right) =
1232 GROUP_AT(group_info, left);
1233 right = left;
1234 left -= stride;
1235 }
1236 GROUP_AT(group_info, right) = tmp;
1237 }
1238 stride /= 3;
1239 }
1240}
1241
1242/* a simple bsearch */
1243int groups_search(const struct group_info *group_info, gid_t grp)
1244{
1245 unsigned int left, right;
1246
1247 if (!group_info)
1248 return 0;
1249
1250 left = 0;
1251 right = group_info->ngroups;
1252 while (left < right) {
1253 unsigned int mid = (left+right)/2;
1254 int cmp = grp - GROUP_AT(group_info, mid);
1255 if (cmp > 0)
1256 left = mid + 1;
1257 else if (cmp < 0)
1258 right = mid;
1259 else
1260 return 1;
1261 }
1262 return 0;
1263}
1264
1265/**
1266 * set_groups - Change a group subscription in a set of credentials
1267 * @new: The newly prepared set of credentials to alter
1268 * @group_info: The group list to install
1269 *
1270 * Validate a group subscription and, if valid, insert it into a set
1271 * of credentials.
1272 */
1273int set_groups(struct cred *new, struct group_info *group_info)
1274{
1275 int retval;
1276
1277 retval = security_task_setgroups(group_info);
1278 if (retval)
1279 return retval;
1280
1281 put_group_info(new->group_info);
1282 groups_sort(group_info);
1283 get_group_info(group_info);
1284 new->group_info = group_info;
1285 return 0;
1286}
1287
1288EXPORT_SYMBOL(set_groups);
1289
1290/**
1291 * set_current_groups - Change current's group subscription
1292 * @group_info: The group list to impose
1293 *
1294 * Validate a group subscription and, if valid, impose it upon current's task
1295 * security record.
1296 */
1297int set_current_groups(struct group_info *group_info)
1298{
1299 struct cred *new;
1300 int ret;
1301
1302 new = prepare_creds();
1303 if (!new)
1304 return -ENOMEM;
1305
1306 ret = set_groups(new, group_info);
1307 if (ret < 0) {
1308 abort_creds(new);
1309 return ret;
1310 }
1311
1312 return commit_creds(new);
1313}
1314
1315EXPORT_SYMBOL(set_current_groups);
1316
1317SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1318{
1319 const struct cred *cred = current_cred();
1320 int i;
1321
1322 if (gidsetsize < 0)
1323 return -EINVAL;
1324
1325 /* no need to grab task_lock here; it cannot change */
1326 i = cred->group_info->ngroups;
1327 if (gidsetsize) {
1328 if (i > gidsetsize) {
1329 i = -EINVAL;
1330 goto out;
1331 }
1332 if (groups_to_user(grouplist, cred->group_info)) {
1333 i = -EFAULT;
1334 goto out;
1335 }
1336 }
1337out:
1338 return i;
1339}
1340
1341/*
1342 * SMP: Our groups are copy-on-write. We can set them safely
1343 * without another task interfering.
1344 */
1345
1346SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1347{
1348 struct group_info *group_info;
1349 int retval;
1350
1351 if (!capable(CAP_SETGID))
1352 return -EPERM;
1353 if ((unsigned)gidsetsize > NGROUPS_MAX)
1354 return -EINVAL;
1355
1356 group_info = groups_alloc(gidsetsize);
1357 if (!group_info)
1358 return -ENOMEM;
1359 retval = groups_from_user(group_info, grouplist);
1360 if (retval) {
1361 put_group_info(group_info);
1362 return retval;
1363 }
1364
1365 retval = set_current_groups(group_info);
1366 put_group_info(group_info);
1367
1368 return retval;
1369}
1370
1371/*
1372 * Check whether we're fsgid/egid or in the supplemental group..
1373 */
1374int in_group_p(gid_t grp)
1375{
1376 const struct cred *cred = current_cred();
1377 int retval = 1;
1378
1379 if (grp != cred->fsgid)
1380 retval = groups_search(cred->group_info, grp);
1381 return retval;
1382}
1383
1384EXPORT_SYMBOL(in_group_p);
1385
1386int in_egroup_p(gid_t grp)
1387{
1388 const struct cred *cred = current_cred();
1389 int retval = 1;
1390
1391 if (grp != cred->egid)
1392 retval = groups_search(cred->group_info, grp);
1393 return retval;
1394}
1395
1396EXPORT_SYMBOL(in_egroup_p);
1397
1398DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1399 1117
1400SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
@@ -1793,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1511 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1512 error = SET_TSC_CTL(arg2);
1795 break; 1513 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE:
1515 error = perf_counter_task_disable();
1516 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE:
1518 error = perf_counter_task_enable();
1519 break;
1796 case PR_GET_TIMERSLACK: 1520 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1521 error = current->timer_slack_ns;
1798 break; 1522 break;
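The prctl() additions give a task a cheap way to pause and resume every perf counter attached to it, so a benchmark can keep its own setup phase out of the measurement. A hedged userspace sketch; the fallback constant values below are recalled from the headers of this period and should be checked against the installed <linux/prctl.h>:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_DISABLE
#define PR_TASK_PERF_COUNTERS_DISABLE	31	/* assumed value, verify locally */
#endif
#ifndef PR_TASK_PERF_COUNTERS_ENABLE
#define PR_TASK_PERF_COUNTERS_ENABLE	32	/* assumed value, verify locally */
#endif

int main(void)
{
	/* stop counting while we do uninteresting setup work */
	if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
		perror("prctl(PR_TASK_PERF_COUNTERS_DISABLE)");

	/* ... setup we do not want in the profile ... */

	/* resume counting for the measured region */
	if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
		perror("prctl(PR_TASK_PERF_COUNTERS_ENABLE)");

	return 0;
}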
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb76..ab462b9968d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -49,6 +50,7 @@
49#include <linux/reboot.h> 50#include <linux/reboot.h>
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
53#include <linux/perf_counter.h>
52 54
53#include <asm/uaccess.h> 55#include <asm/uaccess.h>
54#include <asm/processor.h> 56#include <asm/processor.h>
@@ -114,6 +116,7 @@ static int ngroups_max = NGROUPS_MAX;
114 116
115#ifdef CONFIG_MODULES 117#ifdef CONFIG_MODULES
116extern char modprobe_path[]; 118extern char modprobe_path[];
119extern int modules_disabled;
117#endif 120#endif
118#ifdef CONFIG_CHR_DEV_SG 121#ifdef CONFIG_CHR_DEV_SG
119extern int sg_big_buff; 122extern int sg_big_buff;
@@ -326,6 +329,14 @@ static struct ctl_table kern_table[] = {
326 .mode = 0644, 329 .mode = 0644,
327 .proc_handler = &proc_dointvec, 330 .proc_handler = &proc_dointvec,
328 }, 331 },
332 {
333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration",
335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
329#endif 340#endif
330 { 341 {
331 .ctl_name = CTL_UNNUMBERED, 342 .ctl_name = CTL_UNNUMBERED,
@@ -534,6 +545,17 @@ static struct ctl_table kern_table[] = {
534 .proc_handler = &proc_dostring, 545 .proc_handler = &proc_dostring,
535 .strategy = &sysctl_string, 546 .strategy = &sysctl_string,
536 }, 547 },
548 {
549 .ctl_name = CTL_UNNUMBERED,
550 .procname = "modules_disabled",
551 .data = &modules_disabled,
552 .maxlen = sizeof(int),
553 .mode = 0644,
554 /* only handle a transition from default "0" to "1" */
555 .proc_handler = &proc_dointvec_minmax,
556 .extra1 = &one,
557 .extra2 = &one,
558 },
537#endif 559#endif
538#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 560#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
539 { 561 {
@@ -731,6 +753,14 @@ static struct ctl_table kern_table[] = {
731 }, 753 },
732 { 754 {
733 .ctl_name = CTL_UNNUMBERED, 755 .ctl_name = CTL_UNNUMBERED,
756 .procname = "bootloader_version",
757 .data = &bootloader_version,
758 .maxlen = sizeof (int),
759 .mode = 0444,
760 .proc_handler = &proc_dointvec,
761 },
762 {
763 .ctl_name = CTL_UNNUMBERED,
734 .procname = "kstack_depth_to_print", 764 .procname = "kstack_depth_to_print",
735 .data = &kstack_depth_to_print, 765 .data = &kstack_depth_to_print,
736 .maxlen = sizeof(int), 766 .maxlen = sizeof(int),
@@ -912,6 +942,43 @@ static struct ctl_table kern_table[] = {
912 .child = slow_work_sysctls, 942 .child = slow_work_sysctls,
913 }, 943 },
914#endif 944#endif
945#ifdef CONFIG_PERF_COUNTERS
946 {
947 .ctl_name = CTL_UNNUMBERED,
948 .procname = "perf_counter_paranoid",
949 .data = &sysctl_perf_counter_paranoid,
950 .maxlen = sizeof(sysctl_perf_counter_paranoid),
951 .mode = 0644,
952 .proc_handler = &proc_dointvec,
953 },
954 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "perf_counter_mlock_kb",
957 .data = &sysctl_perf_counter_mlock,
958 .maxlen = sizeof(sysctl_perf_counter_mlock),
959 .mode = 0644,
960 .proc_handler = &proc_dointvec,
961 },
962 {
963 .ctl_name = CTL_UNNUMBERED,
964 .procname = "perf_counter_max_sample_rate",
965 .data = &sysctl_perf_counter_sample_rate,
966 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
967 .mode = 0644,
968 .proc_handler = &proc_dointvec,
969 },
970#endif
971#ifdef CONFIG_KMEMCHECK
972 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "kmemcheck",
975 .data = &kmemcheck_enabled,
976 .maxlen = sizeof(int),
977 .mode = 0644,
978 .proc_handler = &proc_dointvec,
979 },
980#endif
981
915/* 982/*
916 * NOTE: do not add new entries to this table unless you have read 983 * NOTE: do not add new entries to this table unless you have read
917 * Documentation/sysctl/ctl_unnumbered.txt 984 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1225,7 +1292,6 @@ static struct ctl_table vm_table[] = {
1225 .strategy = &sysctl_jiffies, 1292 .strategy = &sysctl_jiffies,
1226 }, 1293 },
1227#endif 1294#endif
1228#ifdef CONFIG_SECURITY
1229 { 1295 {
1230 .ctl_name = CTL_UNNUMBERED, 1296 .ctl_name = CTL_UNNUMBERED,
1231 .procname = "mmap_min_addr", 1297 .procname = "mmap_min_addr",
@@ -1234,7 +1300,6 @@ static struct ctl_table vm_table[] = {
1234 .mode = 0644, 1300 .mode = 0644,
1235 .proc_handler = &proc_doulongvec_minmax, 1301 .proc_handler = &proc_doulongvec_minmax,
1236 }, 1302 },
1237#endif
1238#ifdef CONFIG_NUMA 1303#ifdef CONFIG_NUMA
1239 { 1304 {
1240 .ctl_name = CTL_UNNUMBERED, 1305 .ctl_name = CTL_UNNUMBERED,
@@ -1272,7 +1337,6 @@ static struct ctl_table vm_table[] = {
1272 .extra2 = &one, 1337 .extra2 = &one,
1273 }, 1338 },
1274#endif 1339#endif
1275#ifdef CONFIG_UNEVICTABLE_LRU
1276 { 1340 {
1277 .ctl_name = CTL_UNNUMBERED, 1341 .ctl_name = CTL_UNNUMBERED,
1278 .procname = "scan_unevictable_pages", 1342 .procname = "scan_unevictable_pages",
@@ -1281,7 +1345,6 @@ static struct ctl_table vm_table[] = {
1281 .mode = 0644, 1345 .mode = 0644,
1282 .proc_handler = &scan_unevictable_handler, 1346 .proc_handler = &scan_unevictable_handler,
1283 }, 1347 },
1284#endif
1285/* 1348/*
1286 * NOTE: do not add new entries to this table unless you have read 1349 * NOTE: do not add new entries to this table unless you have read
1287 * Documentation/sysctl/ctl_unnumbered.txt 1350 * Documentation/sysctl/ctl_unnumbered.txt
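Most of the new kern_table entries are plain integer knobs, but modules_disabled is deliberately one-way: with extra1 and extra2 both pointing at &one, proc_dointvec_minmax() only ever accepts the value 1, so module loading cannot be re-enabled once it has been switched off. A small userspace illustration (write_sysctl is a hypothetical helper; needs root, and it really does disable module loading until reboot):

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);	/* buffered write errors surface here */
}

int main(void)
{
	const char *knob = "/proc/sys/kernel/modules_disabled";

	if (write_sysctl(knob, "1"))
		perror("writing 1 to modules_disabled");

	/* the range check rejects anything but 1, so this write fails */
	if (write_sysctl(knob, "0"))
		perror("writing 0 back (expected to fail)");

	return 0;
}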
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..1ad6dd461119 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
21 22
22/* The registered clock event devices */ 23/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
54 55
55 return (unsigned long) clc; 56 return (unsigned long) clc;
56} 57}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns);
57 59
58/** 60/**
59 * clockevents_set_mode - set the operating mode of a clock event device 61 * clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +189,7 @@ void clockevents_register_device(struct clock_event_device *dev)
187 189
188 spin_unlock(&clockevents_lock); 190 spin_unlock(&clockevents_lock);
189} 191}
192EXPORT_SYMBOL_GPL(clockevents_register_device);
190 193
191/* 194/*
192 * Noop handler when we shut down an event device 195 * Noop handler when we shut down an event device
@@ -251,4 +254,15 @@ void clockevents_notify(unsigned long reason, void *arg)
251 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
252} 255}
253EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
254#endif 268#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0..592bf584d1d2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
402 unsigned long flags; 402 unsigned long flags;
403 int ret; 403 int ret;
404 404
405 /* save mult_orig on registration */
406 c->mult_orig = c->mult;
407
408 spin_lock_irqsave(&clocksource_lock, flags); 405 spin_lock_irqsave(&clocksource_lock, flags);
409 ret = clocksource_enqueue(c); 406 ret = clocksource_enqueue(c);
410 if (!ret) 407 if (!ret)
@@ -512,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
512 } 509 }
513 } 510 }
514 511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
515 /* Reselect, when the override name has changed */ 524 /* Reselect, when the override name has changed */
516 if (ovr != clocksource_override) { 525 if (ovr != clocksource_override) {
517 clocksource_override = ovr; 526 clocksource_override = ovr;
@@ -540,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
540 549
541 spin_lock_irq(&clocksource_lock); 550 spin_lock_irq(&clocksource_lock);
542 list_for_each_entry(src, &clocksource_list, list) { 551 list_for_each_entry(src, &clocksource_list, list) {
543 count += snprintf(buf + count, 552 /*
553 * Don't show non-HRES clocksource if the tick code is
554 * in one shot mode (highres=on or nohz=on)
555 */
556 if (!tick_oneshot_mode_active() ||
557 (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
558 count += snprintf(buf + count,
544 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
545 "%s ", src->name); 560 "%s ", src->name);
546 } 561 }
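The override path now refuses to switch to a clocksource that is not flagged CLOCK_SOURCE_VALID_FOR_HRES while the tick layer is in oneshot (highres/nohz) mode, and the available list hides such sources for the same reason. The knobs live in sysfs; a small reader, assuming the clocksource sysdev paths of this era (the dump helper is hypothetical):

#include <stdio.h>

static void dump(const char *label, const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s %s", label, buf);	/* buf keeps its trailing newline */
	fclose(f);
}

int main(void)
{
	dump("available:",
	     "/sys/devices/system/clocksource/clocksource0/available_clocksource");
	dump("current:  ",
	     "/sys/devices/system/clocksource/clocksource0/current_clocksource");
	return 0;
}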
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..877dbedc3118 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
27 * timer stops in C3 state. 27 * timer stops in C3 state.
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
128 return 0; 128 return 0;
129} 129}
130 130
131/**
 132 * tick_oneshot_mode_active - check whether the system is in oneshot mode
 133 *
 134 * Returns 1 when either nohz or highres is enabled, otherwise 0.
135 */
136int tick_oneshot_mode_active(void)
137{
138 unsigned long flags;
139 int ret;
140
141 local_irq_save(flags);
142 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
143 local_irq_restore(flags);
144
145 return ret;
146}
147
131#ifdef CONFIG_HIGH_RES_TIMERS 148#ifdef CONFIG_HIGH_RES_TIMERS
132/** 149/**
133 * tick_init_highres - switch to high resolution mode 150 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..2aff39c6f10c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -349,7 +349,7 @@ void tick_nohz_stop_sched_tick(int inidle)
349 349
350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
351 hrtimer_start(&ts->sched_timer, expires, 351 hrtimer_start(&ts->sched_timer, expires,
352 HRTIMER_MODE_ABS); 352 HRTIMER_MODE_ABS_PINNED);
353 /* Check, if the timer was already in the past */ 353 /* Check, if the timer was already in the past */
354 if (hrtimer_active(&ts->sched_timer)) 354 if (hrtimer_active(&ts->sched_timer))
355 goto out; 355 goto out;
@@ -395,7 +395,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
395 395
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer, 397 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS); 398 HRTIMER_MODE_ABS_PINNED);
399 /* Check, if the timer was already in the past */ 399 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer)) 400 if (hrtimer_active(&ts->sched_timer))
401 break; 401 break;
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
698 698
699 for (;;) { 699 for (;;) {
700 hrtimer_forward(&ts->sched_timer, now, tick_period); 700 hrtimer_forward(&ts->sched_timer, now, tick_period);
701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); 701 hrtimer_start_expires(&ts->sched_timer,
702 HRTIMER_MODE_ABS_PINNED);
702 /* Check, if the timer was already in the past */ 703 /* Check, if the timer was already in the past */
703 if (hrtimer_active(&ts->sched_timer)) 704 if (hrtimer_active(&ts->sched_timer))
704 break; 705 break;
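HRTIMER_MODE_ABS_PINNED is the hrtimer counterpart of the timer-migration work further down in kernel/timer.c: the per-cpu tick timer must keep firing on its own CPU, so it opts out of being pushed to the nohz load-balancer CPU. A sketch of arming a pinned hrtimer, using the relative variant of the same mode (my_poll_timer and its functions are hypothetical):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_poll_timer;	/* hypothetical per-CPU poll timer */

static enum hrtimer_restart my_poll_fn(struct hrtimer *t)
{
	/* per-CPU polling work */
	return HRTIMER_NORESTART;
}

static void my_poll_start(void)
{
	hrtimer_init(&my_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	my_poll_timer.function = my_poll_fn;
	/*
	 * The *_PINNED modes keep the timer on the CPU that armed it, so
	 * the new timer-migration logic never moves the expiry to the
	 * nohz load-balancer CPU.
	 */
	hrtimer_start(&my_poll_timer, ktime_set(0, NSEC_PER_MSEC),
		      HRTIMER_MODE_REL_PINNED);
}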
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..e8c77d9c633a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
77 clock->cycle_last = cycle_now; 77 clock->cycle_last = cycle_now;
78 78
79 nsec = cyc2ns(clock, cycle_delta); 79 nsec = cyc2ns(clock, cycle_delta);
80
81 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset();
83
80 timespec_add_ns(&xtime, nsec); 84 timespec_add_ns(&xtime, nsec);
81 85
82 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
111 /* convert to nanoseconds: */ 115 /* convert to nanoseconds: */
112 nsecs = cyc2ns(clock, cycle_delta); 116 nsecs = cyc2ns(clock, cycle_delta);
113 117
118 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset();
120
114 } while (read_seqretry(&xtime_lock, seq)); 121 } while (read_seqretry(&xtime_lock, seq));
115 122
116 timespec_add_ns(ts, nsecs); 123 timespec_add_ns(ts, nsecs);
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..54d3912f8cad 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
41#include <linux/sched.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42#include <asm/unistd.h> 44#include <asm/unistd.h>
@@ -604,13 +606,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
604} 606}
605 607
606static inline int 608static inline int
607__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) 609__mod_timer(struct timer_list *timer, unsigned long expires,
610 bool pending_only, int pinned)
608{ 611{
609 struct tvec_base *base, *new_base; 612 struct tvec_base *base, *new_base;
610 unsigned long flags; 613 unsigned long flags;
 611	int ret;	 614	int ret = 0, cpu;
612
613 ret = 0;
614 615
615 timer_stats_timer_set_start_info(timer); 616 timer_stats_timer_set_start_info(timer);
616 BUG_ON(!timer->function); 617 BUG_ON(!timer->function);
@@ -629,6 +630,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
629 630
630 new_base = __get_cpu_var(tvec_bases); 631 new_base = __get_cpu_var(tvec_bases);
631 632
633 cpu = smp_processor_id();
634
635#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
636 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
637 int preferred_cpu = get_nohz_load_balancer();
638
639 if (preferred_cpu >= 0)
640 cpu = preferred_cpu;
641 }
642#endif
643 new_base = per_cpu(tvec_bases, cpu);
644
632 if (base != new_base) { 645 if (base != new_base) {
633 /* 646 /*
634 * We are trying to schedule the timer on the local CPU. 647 * We are trying to schedule the timer on the local CPU.
@@ -668,7 +681,7 @@ out_unlock:
668 */ 681 */
669int mod_timer_pending(struct timer_list *timer, unsigned long expires) 682int mod_timer_pending(struct timer_list *timer, unsigned long expires)
670{ 683{
671 return __mod_timer(timer, expires, true); 684 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
672} 685}
673EXPORT_SYMBOL(mod_timer_pending); 686EXPORT_SYMBOL(mod_timer_pending);
674 687
@@ -702,11 +715,33 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
702 if (timer->expires == expires && timer_pending(timer)) 715 if (timer->expires == expires && timer_pending(timer))
703 return 1; 716 return 1;
704 717
705 return __mod_timer(timer, expires, false); 718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
706} 719}
707EXPORT_SYMBOL(mod_timer); 720EXPORT_SYMBOL(mod_timer);
708 721
709/** 722/**
723 * mod_timer_pinned - modify a timer's timeout
724 * @timer: the timer to be modified
725 * @expires: new timeout in jiffies
726 *
727 * mod_timer_pinned() is a way to update the expire field of an
728 * active timer (if the timer is inactive it will be activated)
729 * and not allow the timer to be migrated to a different CPU.
730 *
731 * mod_timer_pinned(timer, expires) is equivalent to:
732 *
733 * del_timer(timer); timer->expires = expires; add_timer(timer);
734 */
735int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
736{
737 if (timer->expires == expires && timer_pending(timer))
738 return 1;
739
740 return __mod_timer(timer, expires, false, TIMER_PINNED);
741}
742EXPORT_SYMBOL(mod_timer_pinned);
743
744/**
710 * add_timer - start a timer 745 * add_timer - start a timer
711 * @timer: the timer to be added 746 * @timer: the timer to be added
712 * 747 *
@@ -756,6 +791,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
756 wake_up_idle_cpu(cpu); 791 wake_up_idle_cpu(cpu);
757 spin_unlock_irqrestore(&base->lock, flags); 792 spin_unlock_irqrestore(&base->lock, flags);
758} 793}
794EXPORT_SYMBOL_GPL(add_timer_on);
759 795
760/** 796/**
761 * del_timer - deactive a timer. 797 * del_timer - deactive a timer.
@@ -1015,6 +1051,9 @@ cascade:
1015 index = slot = timer_jiffies & TVN_MASK; 1051 index = slot = timer_jiffies & TVN_MASK;
1016 do { 1052 do {
1017 list_for_each_entry(nte, varp->vec + slot, entry) { 1053 list_for_each_entry(nte, varp->vec + slot, entry) {
1054 if (tbase_get_deferrable(nte->base))
1055 continue;
1056
1018 found = 1; 1057 found = 1;
1019 if (time_before(nte->expires, expires)) 1058 if (time_before(nte->expires, expires))
1020 expires = nte->expires; 1059 expires = nte->expires;
@@ -1123,53 +1162,14 @@ void update_process_times(int user_tick)
1123} 1162}
1124 1163
1125/* 1164/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1165 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1166 */
1169static void run_timer_softirq(struct softirq_action *h) 1167static void run_timer_softirq(struct softirq_action *h)
1170{ 1168{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1169 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1170
1171 perf_counter_do_pending();
1172
1173 hrtimer_run_pending(); 1173 hrtimer_run_pending();
1174 1174
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1175 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1187,6 @@ void run_local_timers(void)
1187} 1187}
1188 1188
1189/* 1189/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1190 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1191 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1192 * jiffies is defined in the linker script...
@@ -1205,7 +1195,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1195void do_timer(unsigned long ticks)
1206{ 1196{
1207 jiffies_64 += ticks; 1197 jiffies_64 += ticks;
1208 update_times(ticks); 1198 update_wall_time();
1199 calc_global_load();
1209} 1200}
1210 1201
1211#ifdef __ARCH_WANT_SYS_ALARM 1202#ifdef __ARCH_WANT_SYS_ALARM
@@ -1353,7 +1344,7 @@ signed long __sched schedule_timeout(signed long timeout)
1353 expire = timeout + jiffies; 1344 expire = timeout + jiffies;
1354 1345
1355 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1346 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1356 __mod_timer(&timer, expire, false); 1347 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1357 schedule(); 1348 schedule();
1358 del_singleshot_timer_sync(&timer); 1349 del_singleshot_timer_sync(&timer);
1359 1350
@@ -1406,37 +1397,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1397{
1407 unsigned long mem_total, sav_total; 1398 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1399 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1400 struct timespec tp;
1410 1401
1411 memset(info, 0, sizeof(struct sysinfo)); 1402 memset(info, 0, sizeof(struct sysinfo));
1412 1403
1413 do { 1404 ktime_get_ts(&tp);
1414 struct timespec tp; 1405 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1406 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1407
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1408 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1409
1438 info->procs = nr_threads; 1410 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1411
1441 si_meminfo(info); 1412 si_meminfo(info);
1442 si_swapinfo(info); 1413 si_swapinfo(info);
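With the migration logic in __mod_timer(), a plain mod_timer() issued on an idle CPU may now land on the CPU nominated as the nohz load balancer; mod_timer_pinned() is the opt-out for timers that are semantically per-CPU. A short sketch (my_watchdog is hypothetical; old-style timer callback signature):

#include <linux/jiffies.h>
#include <linux/timer.h>

static void my_watchdog_fn(unsigned long data)
{
	/* check the state of the CPU this timer is pinned to */
}

static DEFINE_TIMER(my_watchdog, my_watchdog_fn, 0, 0);

static void my_watchdog_rearm(void)
{
	/*
	 * mod_timer() may migrate the expiry to the nohz load-balancer CPU
	 * when this CPU is idle; mod_timer_pinned() keeps it here, which is
	 * what a per-CPU watchdog needs.
	 */
	mod_timer_pinned(&my_watchdog, jiffies + HZ);
}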
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d7f01e6e8ba5..ae048a2dbbe8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER
56 select MARKERS 56 select MARKERS
57 bool 57 bool
58 58
59# All tracer options should select GENERIC_TRACER. For those options that are
60# enabled by all tracers (context switch and event tracer) they select TRACING.
61# This allows those options to appear when no other tracer is selected. But the
62# options do not appear when something else selects it. We need the two options
63# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
 64# hiding of the automatic options.
65
59config TRACING 66config TRACING
60 bool 67 bool
61 select DEBUG_FS 68 select DEBUG_FS
@@ -66,6 +73,10 @@ config TRACING
66 select BINARY_PRINTF 73 select BINARY_PRINTF
67 select EVENT_TRACING 74 select EVENT_TRACING
68 75
76config GENERIC_TRACER
77 bool
78 select TRACING
79
69# 80#
70# Minimum requirements an architecture has to meet for us to 81# Minimum requirements an architecture has to meet for us to
71# be able to offer generic tracing facilities: 82# be able to offer generic tracing facilities:
@@ -95,7 +106,7 @@ config FUNCTION_TRACER
95 depends on HAVE_FUNCTION_TRACER 106 depends on HAVE_FUNCTION_TRACER
96 select FRAME_POINTER 107 select FRAME_POINTER
97 select KALLSYMS 108 select KALLSYMS
98 select TRACING 109 select GENERIC_TRACER
99 select CONTEXT_SWITCH_TRACER 110 select CONTEXT_SWITCH_TRACER
100 help 111 help
101 Enable the kernel to trace every kernel function. This is done 112 Enable the kernel to trace every kernel function. This is done
@@ -126,7 +137,7 @@ config IRQSOFF_TRACER
126 depends on TRACE_IRQFLAGS_SUPPORT 137 depends on TRACE_IRQFLAGS_SUPPORT
127 depends on GENERIC_TIME 138 depends on GENERIC_TIME
128 select TRACE_IRQFLAGS 139 select TRACE_IRQFLAGS
129 select TRACING 140 select GENERIC_TRACER
130 select TRACER_MAX_TRACE 141 select TRACER_MAX_TRACE
131 help 142 help
132 This option measures the time spent in irqs-off critical 143 This option measures the time spent in irqs-off critical
@@ -136,7 +147,7 @@ config IRQSOFF_TRACER
136 disabled by default and can be runtime (re-)started 147 disabled by default and can be runtime (re-)started
137 via: 148 via:
138 149
139 echo 0 > /debugfs/tracing/tracing_max_latency 150 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
140 151
141 (Note that kernel size and overhead increases with this option 152 (Note that kernel size and overhead increases with this option
142 enabled. This option and the preempt-off timing option can be 153 enabled. This option and the preempt-off timing option can be
@@ -147,7 +158,7 @@ config PREEMPT_TRACER
147 default n 158 default n
148 depends on GENERIC_TIME 159 depends on GENERIC_TIME
149 depends on PREEMPT 160 depends on PREEMPT
150 select TRACING 161 select GENERIC_TRACER
151 select TRACER_MAX_TRACE 162 select TRACER_MAX_TRACE
152 help 163 help
153 This option measures the time spent in preemption off critical 164 This option measures the time spent in preemption off critical
@@ -157,7 +168,7 @@ config PREEMPT_TRACER
157 disabled by default and can be runtime (re-)started 168 disabled by default and can be runtime (re-)started
158 via: 169 via:
159 170
160 echo 0 > /debugfs/tracing/tracing_max_latency 171 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
161 172
162 (Note that kernel size and overhead increases with this option 173 (Note that kernel size and overhead increases with this option
163 enabled. This option and the irqs-off timing option can be 174 enabled. This option and the irqs-off timing option can be
@@ -166,7 +177,7 @@ config PREEMPT_TRACER
166config SYSPROF_TRACER 177config SYSPROF_TRACER
167 bool "Sysprof Tracer" 178 bool "Sysprof Tracer"
168 depends on X86 179 depends on X86
169 select TRACING 180 select GENERIC_TRACER
170 select CONTEXT_SWITCH_TRACER 181 select CONTEXT_SWITCH_TRACER
171 help 182 help
172 This tracer provides the trace needed by the 'Sysprof' userspace 183 This tracer provides the trace needed by the 'Sysprof' userspace
@@ -174,44 +185,33 @@ config SYSPROF_TRACER
174 185
175config SCHED_TRACER 186config SCHED_TRACER
176 bool "Scheduling Latency Tracer" 187 bool "Scheduling Latency Tracer"
177 select TRACING 188 select GENERIC_TRACER
178 select CONTEXT_SWITCH_TRACER 189 select CONTEXT_SWITCH_TRACER
179 select TRACER_MAX_TRACE 190 select TRACER_MAX_TRACE
180 help 191 help
181 This tracer tracks the latency of the highest priority task 192 This tracer tracks the latency of the highest priority task
182 to be scheduled in, starting from the point it has woken up. 193 to be scheduled in, starting from the point it has woken up.
183 194
184config ENABLE_CONTEXT_SWITCH_TRACER 195config ENABLE_DEFAULT_TRACERS
185 bool "Trace process context switches" 196 bool "Trace process context switches and events"
186 select TRACING 197 depends on !GENERIC_TRACER
187 select CONTEXT_SWITCH_TRACER
188 help
189 This tracer gets called from the context switch and records
190 all switching of tasks.
191
192config ENABLE_EVENT_TRACING
193 bool "Trace various events in the kernel"
194 select TRACING 198 select TRACING
195 help 199 help
196 This tracer hooks to various trace points in the kernel 200 This tracer hooks to various trace points in the kernel
197 allowing the user to pick and choose which trace point they 201 allowing the user to pick and choose which trace point they
198 want to trace. 202 want to trace. It also includes the sched_switch tracer plugin.
199
200 Note, all tracers enable event tracing. This option is
201 only a convenience to enable event tracing when no other
202 tracers are selected.
203 203
204config FTRACE_SYSCALLS 204config FTRACE_SYSCALLS
205 bool "Trace syscalls" 205 bool "Trace syscalls"
206 depends on HAVE_FTRACE_SYSCALLS 206 depends on HAVE_FTRACE_SYSCALLS
207 select TRACING 207 select GENERIC_TRACER
208 select KALLSYMS 208 select KALLSYMS
209 help 209 help
210 Basic tracer to catch the syscall entry and exit events. 210 Basic tracer to catch the syscall entry and exit events.
211 211
212config BOOT_TRACER 212config BOOT_TRACER
213 bool "Trace boot initcalls" 213 bool "Trace boot initcalls"
214 select TRACING 214 select GENERIC_TRACER
215 select CONTEXT_SWITCH_TRACER 215 select CONTEXT_SWITCH_TRACER
216 help 216 help
217 This tracer helps developers to optimize boot times: it records 217 This tracer helps developers to optimize boot times: it records
@@ -228,7 +228,7 @@ config BOOT_TRACER
228 228
229config TRACE_BRANCH_PROFILING 229config TRACE_BRANCH_PROFILING
230 bool 230 bool
231 select TRACING 231 select GENERIC_TRACER
232 232
233choice 233choice
234 prompt "Branch Profiling" 234 prompt "Branch Profiling"
@@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES
 261	  This tracer profiles all the likely and unlikely macros	 261	  This tracer profiles all the likely and unlikely macros
262 in the kernel. It will display the results in: 262 in the kernel. It will display the results in:
263 263
264 /debugfs/tracing/profile_annotated_branch 264 /sys/kernel/debug/tracing/profile_annotated_branch
265 265
266 Note: this will add a significant overhead, only turn this 266 Note: this will add a significant overhead, only turn this
267 on if you need to profile the system's use of these macros. 267 on if you need to profile the system's use of these macros.
@@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES
274 taken in the kernel is recorded whether it hit or miss. 274 taken in the kernel is recorded whether it hit or miss.
275 The results will be displayed in: 275 The results will be displayed in:
276 276
277 /debugfs/tracing/profile_branch 277 /sys/kernel/debug/tracing/profile_branch
278 278
279 This option also enables the likely/unlikely profiler. 279 This option also enables the likely/unlikely profiler.
280 280
@@ -308,7 +308,7 @@ config BRANCH_TRACER
308config POWER_TRACER 308config POWER_TRACER
309 bool "Trace power consumption behavior" 309 bool "Trace power consumption behavior"
310 depends on X86 310 depends on X86
311 select TRACING 311 select GENERIC_TRACER
312 help 312 help
313 This tracer helps developers to analyze and optimize the kernels 313 This tracer helps developers to analyze and optimize the kernels
314 power management decisions, specifically the C-state and P-state 314 power management decisions, specifically the C-state and P-state
@@ -344,7 +344,7 @@ config STACK_TRACER
344 select KALLSYMS 344 select KALLSYMS
345 help 345 help
346 This special tracer records the maximum stack footprint of the 346 This special tracer records the maximum stack footprint of the
347 kernel and displays it in debugfs/tracing/stack_trace. 347 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
348 348
349 This tracer works by hooking into every function call that the 349 This tracer works by hooking into every function call that the
350 kernel executes, and keeping a maximum stack depth value and 350 kernel executes, and keeping a maximum stack depth value and
@@ -363,14 +363,14 @@ config STACK_TRACER
363config HW_BRANCH_TRACER 363config HW_BRANCH_TRACER
364 depends on HAVE_HW_BRANCH_TRACER 364 depends on HAVE_HW_BRANCH_TRACER
365 bool "Trace hw branches" 365 bool "Trace hw branches"
366 select TRACING 366 select GENERIC_TRACER
367 help 367 help
368 This tracer records all branches on the system in a circular 368 This tracer records all branches on the system in a circular
369 buffer giving access to the last N branches for each cpu. 369 buffer giving access to the last N branches for each cpu.
370 370
371config KMEMTRACE 371config KMEMTRACE
372 bool "Trace SLAB allocations" 372 bool "Trace SLAB allocations"
373 select TRACING 373 select GENERIC_TRACER
374 help 374 help
375 kmemtrace provides tracing for slab allocator functions, such as 375 kmemtrace provides tracing for slab allocator functions, such as
376 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 376 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -390,7 +390,7 @@ config KMEMTRACE
390 390
391config WORKQUEUE_TRACER 391config WORKQUEUE_TRACER
392 bool "Trace workqueues" 392 bool "Trace workqueues"
393 select TRACING 393 select GENERIC_TRACER
394 help 394 help
395 The workqueue tracer provides some statistical informations 395 The workqueue tracer provides some statistical informations
396 about each cpu workqueue thread such as the number of the 396 about each cpu workqueue thread such as the number of the
@@ -406,7 +406,7 @@ config BLK_DEV_IO_TRACE
406 select RELAY 406 select RELAY
407 select DEBUG_FS 407 select DEBUG_FS
408 select TRACEPOINTS 408 select TRACEPOINTS
409 select TRACING 409 select GENERIC_TRACER
410 select STACKTRACE 410 select STACKTRACE
411 help 411 help
412 Say Y here if you want to be able to trace the block layer actions 412 Say Y here if you want to be able to trace the block layer actions
@@ -467,7 +467,7 @@ config FTRACE_SELFTEST
467 467
468config FTRACE_STARTUP_TEST 468config FTRACE_STARTUP_TEST
469 bool "Perform a startup test on ftrace" 469 bool "Perform a startup test on ftrace"
470 depends on TRACING 470 depends on GENERIC_TRACER
471 select FTRACE_SELFTEST 471 select FTRACE_SELFTEST
472 help 472 help
473 This option performs a series of startup tests on ftrace. On bootup 473 This option performs a series of startup tests on ftrace. On bootup
@@ -478,7 +478,7 @@ config FTRACE_STARTUP_TEST
478config MMIOTRACE 478config MMIOTRACE
479 bool "Memory mapped IO tracing" 479 bool "Memory mapped IO tracing"
480 depends on HAVE_MMIOTRACE_SUPPORT && PCI 480 depends on HAVE_MMIOTRACE_SUPPORT && PCI
481 select TRACING 481 select GENERIC_TRACER
482 help 482 help
483 Mmiotrace traces Memory Mapped I/O access and is meant for 483 Mmiotrace traces Memory Mapped I/O access and is meant for
484 debugging and reverse engineering. It is called from the ioremap 484 debugging and reverse engineering. It is called from the ioremap
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 658aace8c41e..ce3b1cd02732 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o 45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
49ifeq ($(CONFIG_BLOCK),y)
50obj-$(CONFIG_EVENT_TRACING) += blktrace.o
51endif
49obj-$(CONFIG_EVENT_TRACING) += trace_events.o 52obj-$(CONFIG_EVENT_TRACING) += trace_events.o
50obj-$(CONFIG_EVENT_TRACING) += trace_export.o 53obj-$(CONFIG_EVENT_TRACING) += trace_export.o
51obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55bc8e5..39af8af6fc30 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27
28#include <trace/events/block.h>
29
28#include "trace_output.h" 30#include "trace_output.h"
29 31
32#ifdef CONFIG_BLK_DEV_IO_TRACE
33
30static unsigned int blktrace_seq __read_mostly = 1; 34static unsigned int blktrace_seq __read_mostly = 1;
31 35
32static struct trace_array *blk_tr; 36static struct trace_array *blk_tr;
@@ -665,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
665 669
666 if (blk_pc_request(rq)) { 670 if (blk_pc_request(rq)) {
667 what |= BLK_TC_ACT(BLK_TC_PC); 671 what |= BLK_TC_ACT(BLK_TC_PC);
668 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 672 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
669 rq->cmd_len, rq->cmd); 673 what, rq->errors, rq->cmd_len, rq->cmd);
670 } else { 674 } else {
671 what |= BLK_TC_ACT(BLK_TC_FS); 675 what |= BLK_TC_ACT(BLK_TC_FS);
672 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 676 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
673 rw, what, rq->errors, 0, NULL); 677 what, rq->errors, 0, NULL);
674 } 678 }
675} 679}
676 680
@@ -877,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q,
877 return; 881 return;
878 882
879 if (blk_pc_request(rq)) 883 if (blk_pc_request(rq))
880 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, 884 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
881 rq->errors, len, data); 885 BLK_TA_DRV_DATA, rq->errors, len, data);
882 else 886 else
883 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 887 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
884 0, BLK_TA_DRV_DATA, rq->errors, len, data); 888 BLK_TA_DRV_DATA, rq->errors, len, data);
885} 889}
886EXPORT_SYMBOL_GPL(blk_add_driver_data); 890EXPORT_SYMBOL_GPL(blk_add_driver_data);
887 891
@@ -1658,3 +1662,72 @@ int blk_trace_init_sysfs(struct device *dev)
1658 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1662 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1659} 1663}
1660 1664
1665#endif /* CONFIG_BLK_DEV_IO_TRACE */
1666
1667#ifdef CONFIG_EVENT_TRACING
1668
1669void blk_dump_cmd(char *buf, struct request *rq)
1670{
1671 int i, end;
1672 int len = rq->cmd_len;
1673 unsigned char *cmd = rq->cmd;
1674
1675 if (!blk_pc_request(rq)) {
1676 buf[0] = '\0';
1677 return;
1678 }
1679
1680 for (end = len - 1; end >= 0; end--)
1681 if (cmd[end])
1682 break;
1683 end++;
1684
1685 for (i = 0; i < len; i++) {
1686 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1687 if (i == end && end != len - 1) {
1688 sprintf(buf, " ..");
1689 break;
1690 }
1691 }
1692}
1693
1694void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1695{
1696 int i = 0;
1697
1698 if (rw & WRITE)
1699 rwbs[i++] = 'W';
1700 else if (rw & 1 << BIO_RW_DISCARD)
1701 rwbs[i++] = 'D';
1702 else if (bytes)
1703 rwbs[i++] = 'R';
1704 else
1705 rwbs[i++] = 'N';
1706
1707 if (rw & 1 << BIO_RW_AHEAD)
1708 rwbs[i++] = 'A';
1709 if (rw & 1 << BIO_RW_BARRIER)
1710 rwbs[i++] = 'B';
1711 if (rw & 1 << BIO_RW_SYNCIO)
1712 rwbs[i++] = 'S';
1713 if (rw & 1 << BIO_RW_META)
1714 rwbs[i++] = 'M';
1715
1716 rwbs[i] = '\0';
1717}
1718
1719void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1720{
1721 int rw = rq->cmd_flags & 0x03;
1722 int bytes;
1723
1724 if (blk_discard_rq(rq))
1725 rw |= (1 << BIO_RW_DISCARD);
1726
1727 bytes = blk_rq_bytes(rq);
1728
1729 blk_fill_rwbs(rwbs, rw, bytes);
1730}
1731
1732#endif /* CONFIG_EVENT_TRACING */
1733
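
The two helpers added above, blk_dump_cmd() and blk_fill_rwbs(), are apparently there so the block tracepoints built under CONFIG_EVENT_TRACING can format a request without the full blktrace machinery. A minimal user-space sketch of the rwbs encoding follows; the WRITE and BIO_RW_* values are illustrative stand-ins, not the real bit positions from the kernel headers.

#include <stdio.h>

/* illustrative stand-ins for the kernel's request flag bits */
#define WRITE           (1 << 0)
#define BIO_RW_AHEAD    1
#define BIO_RW_BARRIER  2
#define BIO_RW_SYNCIO   3
#define BIO_RW_META     4
#define BIO_RW_DISCARD  5

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
	int i = 0;

	if (rw & WRITE)                         /* a write wins over everything else */
		rwbs[i++] = 'W';
	else if (rw & (1 << BIO_RW_DISCARD))
		rwbs[i++] = 'D';
	else if (bytes)                         /* data but not a write: a read */
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';                /* no payload at all */

	if (rw & (1 << BIO_RW_AHEAD))
		rwbs[i++] = 'A';
	if (rw & (1 << BIO_RW_BARRIER))
		rwbs[i++] = 'B';
	if (rw & (1 << BIO_RW_SYNCIO))
		rwbs[i++] = 'S';
	if (rw & (1 << BIO_RW_META))
		rwbs[i++] = 'M';

	rwbs[i] = '\0';
}

int main(void)
{
	char rwbs[8];

	fill_rwbs(rwbs, WRITE | (1 << BIO_RW_SYNCIO), 4096);
	printf("%s\n", rwbs);                   /* prints "WS" */
	return 0;
}
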
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 140699a9a8a7..bb60732ade0c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,6 +32,7 @@
32#include <trace/events/sched.h> 32#include <trace/events/sched.h>
33 33
34#include <asm/ftrace.h> 34#include <asm/ftrace.h>
35#include <asm/setup.h>
35 36
36#include "trace_output.h" 37#include "trace_output.h"
37#include "trace_stat.h" 38#include "trace_stat.h"
@@ -598,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
598 local_irq_save(flags); 599 local_irq_save(flags);
599 600
600 stat = &__get_cpu_var(ftrace_profile_stats); 601 stat = &__get_cpu_var(ftrace_profile_stats);
601 if (!stat->hash) 602 if (!stat->hash || !ftrace_profile_enabled)
602 goto out; 603 goto out;
603 604
604 rec = ftrace_find_profiled_func(stat, ip); 605 rec = ftrace_find_profiled_func(stat, ip);
@@ -629,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
629 630
630 local_irq_save(flags); 631 local_irq_save(flags);
631 stat = &__get_cpu_var(ftrace_profile_stats); 632 stat = &__get_cpu_var(ftrace_profile_stats);
632 if (!stat->hash) 633 if (!stat->hash || !ftrace_profile_enabled)
633 goto out; 634 goto out;
634 635
635 calltime = trace->rettime - trace->calltime; 636 calltime = trace->rettime - trace->calltime;
@@ -723,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
723 ftrace_profile_enabled = 1; 724 ftrace_profile_enabled = 1;
724 } else { 725 } else {
725 ftrace_profile_enabled = 0; 726 ftrace_profile_enabled = 0;
727 /*
728 * unregister_ftrace_profiler calls stop_machine
729 * so this acts like an synchronize_sched.
730 */
726 unregister_ftrace_profiler(); 731 unregister_ftrace_profiler();
727 } 732 }
728 } 733 }
@@ -2369,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
2369 ftrace_set_regex(buf, len, reset, 0); 2374 ftrace_set_regex(buf, len, reset, 0);
2370} 2375}
2371 2376
2377/*
2378 * command line interface to allow users to set filters on boot up.
2379 */
2380#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2381static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2382static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2383
2384static int __init set_ftrace_notrace(char *str)
2385{
2386 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
2387 return 1;
2388}
2389__setup("ftrace_notrace=", set_ftrace_notrace);
2390
2391static int __init set_ftrace_filter(char *str)
2392{
2393 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
2394 return 1;
2395}
2396__setup("ftrace_filter=", set_ftrace_filter);
2397
2398static void __init set_ftrace_early_filter(char *buf, int enable)
2399{
2400 char *func;
2401
2402 while (buf) {
2403 func = strsep(&buf, ",");
2404 ftrace_set_regex(func, strlen(func), 0, enable);
2405 }
2406}
2407
2408static void __init set_ftrace_early_filters(void)
2409{
2410 if (ftrace_filter_buf[0])
2411 set_ftrace_early_filter(ftrace_filter_buf, 1);
2412 if (ftrace_notrace_buf[0])
2413 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2414}
2415
2372static int 2416static int
2373ftrace_regex_release(struct inode *inode, struct file *file, int enable) 2417ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2374{ 2418{
@@ -2829,6 +2873,8 @@ void __init ftrace_init(void)
2829 if (ret) 2873 if (ret)
2830 pr_warning("Failed to register trace ftrace module notifier\n"); 2874 pr_warning("Failed to register trace ftrace module notifier\n");
2831 2875
2876 set_ftrace_early_filters();
2877
2832 return; 2878 return;
2833 failed: 2879 failed:
2834 ftrace_disabled = 1; 2880 ftrace_disabled = 1;
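
The ftrace_filter= and ftrace_notrace= boot parameters added above are copied verbatim into __initdata buffers at __setup() time and only parsed once ftrace_init() calls set_ftrace_early_filters(), splitting on commas with strsep(). A small user-space sketch of that comma splitting; the buffer contents are a made-up example.

#define _DEFAULT_SOURCE                 /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* e.g. booted with ftrace_filter=sys_read,sys_write,do_fork */
	char buf[] = "sys_read,sys_write,do_fork";
	char *p = buf;
	char *func;

	while (p) {
		func = strsep(&p, ",");
		/* the kernel calls ftrace_set_regex(func, strlen(func), 0, enable) here */
		printf("filter entry: %s\n", func);
	}
	return 0;
}
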
@@ -3172,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
3172 } 3218 }
3173 3219
3174 if (t->ret_stack == NULL) { 3220 if (t->ret_stack == NULL) {
3175 t->curr_ret_stack = -1;
3176 /* Make sure IRQs see the -1 first: */
3177 barrier();
3178 t->ret_stack = ret_stack_list[start++];
3179 atomic_set(&t->tracing_graph_pause, 0); 3221 atomic_set(&t->tracing_graph_pause, 0);
3180 atomic_set(&t->trace_overrun, 0); 3222 atomic_set(&t->trace_overrun, 0);
3223 t->curr_ret_stack = -1;
3224 /* Make sure the tasks see the -1 first: */
3225 smp_wmb();
3226 t->ret_stack = ret_stack_list[start++];
3181 } 3227 }
3182 } while_each_thread(g, t); 3228 } while_each_thread(g, t);
3183 3229
@@ -3235,8 +3281,10 @@ static int start_graph_tracing(void)
3235 return -ENOMEM; 3281 return -ENOMEM;
3236 3282
3237 /* The cpu_boot init_task->ret_stack will never be freed */ 3283 /* The cpu_boot init_task->ret_stack will never be freed */
3238 for_each_online_cpu(cpu) 3284 for_each_online_cpu(cpu) {
3239 ftrace_graph_init_task(idle_task(cpu)); 3285 if (!idle_task(cpu)->ret_stack)
3286 ftrace_graph_init_task(idle_task(cpu));
3287 }
3240 3288
3241 do { 3289 do {
3242 ret = alloc_retstack_tasklist(ret_stack_list); 3290 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -3328,18 +3376,25 @@ void unregister_ftrace_graph(void)
3328/* Allocate a return stack for newly created task */ 3376/* Allocate a return stack for newly created task */
3329void ftrace_graph_init_task(struct task_struct *t) 3377void ftrace_graph_init_task(struct task_struct *t)
3330{ 3378{
3379 /* Make sure we do not use the parent ret_stack */
3380 t->ret_stack = NULL;
3381
3331 if (ftrace_graph_active) { 3382 if (ftrace_graph_active) {
3332 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 3383 struct ftrace_ret_stack *ret_stack;
3384
3385 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3333 * sizeof(struct ftrace_ret_stack), 3386 * sizeof(struct ftrace_ret_stack),
3334 GFP_KERNEL); 3387 GFP_KERNEL);
3335 if (!t->ret_stack) 3388 if (!ret_stack)
3336 return; 3389 return;
3337 t->curr_ret_stack = -1; 3390 t->curr_ret_stack = -1;
3338 atomic_set(&t->tracing_graph_pause, 0); 3391 atomic_set(&t->tracing_graph_pause, 0);
3339 atomic_set(&t->trace_overrun, 0); 3392 atomic_set(&t->trace_overrun, 0);
3340 t->ftrace_timestamp = 0; 3393 t->ftrace_timestamp = 0;
3341 } else 3394 /* make curr_ret_stack visable before we add the ret_stack */
3342 t->ret_stack = NULL; 3395 smp_wmb();
3396 t->ret_stack = ret_stack;
3397 }
3343} 3398}
3344 3399
3345void ftrace_graph_exit_task(struct task_struct *t) 3400void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b24d49604c..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -370,6 +371,9 @@ static inline int test_time_stamp(u64 delta)
370/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 371/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
371#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 372#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
372 373
374/* Max number of timestamps that can fit on a page */
375#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
376
373int ring_buffer_print_page_header(struct trace_seq *s) 377int ring_buffer_print_page_header(struct trace_seq *s)
374{ 378{
375 struct buffer_data_page field; 379 struct buffer_data_page field;
@@ -423,6 +427,8 @@ struct ring_buffer {
423 atomic_t record_disabled; 427 atomic_t record_disabled;
424 cpumask_var_t cpumask; 428 cpumask_var_t cpumask;
425 429
430 struct lock_class_key *reader_lock_key;
431
426 struct mutex mutex; 432 struct mutex mutex;
427 433
428 struct ring_buffer_per_cpu **buffers; 434 struct ring_buffer_per_cpu **buffers;
@@ -562,6 +568,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
562 cpu_buffer->cpu = cpu; 568 cpu_buffer->cpu = cpu;
563 cpu_buffer->buffer = buffer; 569 cpu_buffer->buffer = buffer;
564 spin_lock_init(&cpu_buffer->reader_lock); 570 spin_lock_init(&cpu_buffer->reader_lock);
571 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
565 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 572 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
566 INIT_LIST_HEAD(&cpu_buffer->pages); 573 INIT_LIST_HEAD(&cpu_buffer->pages);
567 574
@@ -632,7 +639,8 @@ static int rb_cpu_notify(struct notifier_block *self,
632 * when the buffer wraps. If this flag is not set, the buffer will 639 * when the buffer wraps. If this flag is not set, the buffer will
633 * drop data when the tail hits the head. 640 * drop data when the tail hits the head.
634 */ 641 */
635struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 642struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
643 struct lock_class_key *key)
636{ 644{
637 struct ring_buffer *buffer; 645 struct ring_buffer *buffer;
638 int bsize; 646 int bsize;
@@ -655,6 +663,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
655 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 663 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
656 buffer->flags = flags; 664 buffer->flags = flags;
657 buffer->clock = trace_clock_local; 665 buffer->clock = trace_clock_local;
666 buffer->reader_lock_key = key;
658 667
659 /* need at least two pages */ 668 /* need at least two pages */
660 if (buffer->pages == 1) 669 if (buffer->pages == 1)
@@ -712,7 +721,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
712 kfree(buffer); 721 kfree(buffer);
713 return NULL; 722 return NULL;
714} 723}
715EXPORT_SYMBOL_GPL(ring_buffer_alloc); 724EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
716 725
717/** 726/**
718 * ring_buffer_free - free a ring buffer. 727 * ring_buffer_free - free a ring buffer.
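
Passing a struct lock_class_key into __ring_buffer_alloc() lets lockdep give each ring buffer user its own class for the per-cpu reader_lock instead of lumping all of them into one. The matching header change is not part of this hunk; presumably ring_buffer_alloc() becomes a wrapper macro along these lines (a sketch, not the verbatim kernel header), so every allocation site supplies its own static key, which is what lockdep_set_class() consumes in rb_allocate_cpu_buffer() above.

/* sketch of the expected wrapper in include/linux/ring_buffer.h */
#define ring_buffer_alloc(size, flags)				\
({								\
	static struct lock_class_key __key;			\
	__ring_buffer_alloc((size), (flags), &__key);		\
})
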
@@ -1262,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1262 if (tail < BUF_PAGE_SIZE) { 1271 if (tail < BUF_PAGE_SIZE) {
1263 /* Mark the rest of the page with padding */ 1272 /* Mark the rest of the page with padding */
1264 event = __rb_page_index(tail_page, tail); 1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1265 rb_event_set_padding(event); 1275 rb_event_set_padding(event);
1266 } 1276 }
1267 1277
@@ -1319,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1319 return NULL; 1329 return NULL;
1320 1330
1321 event = __rb_page_index(tail_page, tail); 1331 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield);
1322 rb_update_event(event, type, length); 1333 rb_update_event(event, type, length);
1323 1334
1324 /* The passed in type is zero for DATA */ 1335 /* The passed in type is zero for DATA */
@@ -1335,6 +1346,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1335 return event; 1346 return event;
1336} 1347}
1337 1348
1349static inline int
1350rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1351 struct ring_buffer_event *event)
1352{
1353 unsigned long new_index, old_index;
1354 struct buffer_page *bpage;
1355 unsigned long index;
1356 unsigned long addr;
1357
1358 new_index = rb_event_index(event);
1359 old_index = new_index + rb_event_length(event);
1360 addr = (unsigned long)event;
1361 addr &= PAGE_MASK;
1362
1363 bpage = cpu_buffer->tail_page;
1364
1365 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1366 /*
1367 * This is on the tail page. It is possible that
1368 * a write could come in and move the tail page
1369 * and write to the next page. That is fine
1370 * because we just shorten what is on this page.
1371 */
1372 index = local_cmpxchg(&bpage->write, old_index, new_index);
1373 if (index == old_index)
1374 return 1;
1375 }
1376
1377 /* could not discard */
1378 return 0;
1379}
1380
1338static int 1381static int
1339rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, 1382rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1340 u64 *ts, u64 *delta) 1383 u64 *ts, u64 *delta)
@@ -1377,16 +1420,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1377 event->array[0] = *delta >> TS_SHIFT; 1420 event->array[0] = *delta >> TS_SHIFT;
1378 } else { 1421 } else {
1379 cpu_buffer->commit_page->page->time_stamp = *ts; 1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1380 event->time_delta = 0; 1423 /* try to discard, since we do not need this */
1381 event->array[0] = 0; 1424 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */
1426 event->time_delta = 0;
1427 event->array[0] = 0;
1428 }
1382 } 1429 }
1383 cpu_buffer->write_stamp = *ts; 1430 cpu_buffer->write_stamp = *ts;
1384 /* let the caller know this was the commit */ 1431 /* let the caller know this was the commit */
1385 ret = 1; 1432 ret = 1;
1386 } else { 1433 } else {
1387 /* Darn, this is just wasted space */ 1434 /* Try to discard the event */
1388 event->time_delta = 0; 1435 if (!rb_try_to_discard(cpu_buffer, event)) {
1389 event->array[0] = 0; 1436 /* Darn, this is just wasted space */
1437 event->time_delta = 0;
1438 event->array[0] = 0;
1439 }
1390 ret = 0; 1440 ret = 0;
1391 } 1441 }
1392 1442
@@ -1682,10 +1732,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1682 struct ring_buffer_event *event) 1732 struct ring_buffer_event *event)
1683{ 1733{
1684 struct ring_buffer_per_cpu *cpu_buffer; 1734 struct ring_buffer_per_cpu *cpu_buffer;
1685 unsigned long new_index, old_index;
1686 struct buffer_page *bpage;
1687 unsigned long index;
1688 unsigned long addr;
1689 int cpu; 1735 int cpu;
1690 1736
1691 /* The event is discarded regardless */ 1737 /* The event is discarded regardless */
@@ -1701,24 +1747,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1701 cpu = smp_processor_id(); 1747 cpu = smp_processor_id();
1702 cpu_buffer = buffer->buffers[cpu]; 1748 cpu_buffer = buffer->buffers[cpu];
1703 1749
1704 new_index = rb_event_index(event); 1750 if (!rb_try_to_discard(cpu_buffer, event))
1705 old_index = new_index + rb_event_length(event); 1751 goto out;
1706 addr = (unsigned long)event;
1707 addr &= PAGE_MASK;
1708
1709 bpage = cpu_buffer->tail_page;
1710
1711 if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
1712 /*
1713 * This is on the tail page. It is possible that
1714 * a write could come in and move the tail page
1715 * and write to the next page. That is fine
1716 * because we just shorten what is on this page.
1717 */
1718 index = local_cmpxchg(&bpage->write, old_index, new_index);
1719 if (index == old_index)
1720 goto out;
1721 }
1722 1752
1723 /* 1753 /*
1724 * The commit is still visible by the reader, so we 1754 * The commit is still visible by the reader, so we
@@ -2253,8 +2283,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2253 * Check if we are at the end of the buffer. 2283 * Check if we are at the end of the buffer.
2254 */ 2284 */
2255 if (iter->head >= rb_page_size(iter->head_page)) { 2285 if (iter->head >= rb_page_size(iter->head_page)) {
2256 if (RB_WARN_ON(buffer, 2286 /* discarded commits can make the page empty */
2257 iter->head_page == cpu_buffer->commit_page)) 2287 if (iter->head_page == cpu_buffer->commit_page)
2258 return; 2288 return;
2259 rb_inc_iter(iter); 2289 rb_inc_iter(iter);
2260 return; 2290 return;
@@ -2297,12 +2327,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2297 /* 2327 /*
2298 * We repeat when a timestamp is encountered. It is possible 2328 * We repeat when a timestamp is encountered. It is possible
2299 * to get multiple timestamps from an interrupt entering just 2329 * to get multiple timestamps from an interrupt entering just
2300 * as one timestamp is about to be written. The max times 2330 * as one timestamp is about to be written, or from discarded
2301 * that this can happen is the number of nested interrupts we 2331 * commits. The most that we can have is the number on a single page.
2302 * can have. Nesting 10 deep of interrupts is clearly
2303 * an anomaly.
2304 */ 2332 */
2305 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2333 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2306 return NULL; 2334 return NULL;
2307 2335
2308 reader = rb_get_reader_page(cpu_buffer); 2336 reader = rb_get_reader_page(cpu_buffer);
@@ -2368,14 +2396,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2368 2396
2369 again: 2397 again:
2370 /* 2398 /*
2371 * We repeat when a timestamp is encountered. It is possible 2399 * We repeat when a timestamp is encountered.
2372 * to get multiple timestamps from an interrupt entering just 2400 * We can get multiple timestamps by nested interrupts or also
2373 * as one timestamp is about to be written. The max times 2401 * if filtering is on (discarding commits). Since discarding
2374 * that this can happen is the number of nested interrupts we 2402 * commits can be frequent we can get a lot of timestamps.
2375 * can have. Nesting 10 deep of interrupts is clearly 2403 * But we limit them by not adding timestamps if they begin
2376 * an anomaly. 2404 * at the start of a page.
2377 */ 2405 */
2378 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2406 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2379 return NULL; 2407 return NULL;
2380 2408
2381 if (rb_per_cpu_empty(cpu_buffer)) 2409 if (rb_per_cpu_empty(cpu_buffer))
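
rb_try_to_discard() reclaims space by moving the tail page's write index back over the just-reserved event, but only if the cmpxchg proves nothing was written after it; otherwise the event is merely zeroed in place, and ring_buffer_discard_commit() now reuses the same helper. A user-space sketch of that back-off, with C11 atomics standing in for local_cmpxchg():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct buffer_page {
	_Atomic unsigned long write;    /* bytes committed on this page */
};

/*
 * old_index: where the write index currently is (end of our event)
 * new_index: where our event starts
 * Succeeds only if no later write moved the index past old_index.
 */
static bool try_to_discard(struct buffer_page *bpage,
			   unsigned long old_index, unsigned long new_index)
{
	unsigned long expected = old_index;

	return atomic_compare_exchange_strong(&bpage->write,
					      &expected, new_index);
}

int main(void)
{
	struct buffer_page bp;

	atomic_init(&bp.write, 128);            /* our event spans bytes 96..127 */
	if (try_to_discard(&bp, 128, 96))
		puts("event discarded, space handed back");
	return 0;
}
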
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3a8a87d7e91..c1878bfb2e1e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
344/* 344/*
345 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */ 348 */
349static void 349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
2414 2414
2415static const char readme_msg[] = 2415static const char readme_msg[] =
2416 "tracing mini-HOWTO:\n\n" 2416 "tracing mini-HOWTO:\n\n"
2417 "# mkdir /debug\n" 2417 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2418 "# mount -t debugfs nodev /debug\n\n" 2418 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2419 "# cat /debug/tracing/available_tracers\n"
2420 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2419 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2421 "# cat /debug/tracing/current_tracer\n" 2420 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2422 "nop\n" 2421 "nop\n"
2423 "# echo sched_switch > /debug/tracing/current_tracer\n" 2422 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2424 "# cat /debug/tracing/current_tracer\n" 2423 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2425 "sched_switch\n" 2424 "sched_switch\n"
2426 "# cat /debug/tracing/trace_options\n" 2425 "# cat /sys/kernel/debug/tracing/trace_options\n"
2427 "noprint-parent nosym-offset nosym-addr noverbose\n" 2426 "noprint-parent nosym-offset nosym-addr noverbose\n"
2428 "# echo print-parent > /debug/tracing/trace_options\n" 2427 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2429 "# echo 1 > /debug/tracing/tracing_enabled\n" 2428 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2430 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2429 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2431 "echo 0 > /debug/tracing/tracing_enabled\n" 2430 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2432; 2431;
2433 2432
2434static ssize_t 2433static ssize_t
@@ -2826,6 +2825,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2826 /* trace pipe does not show start of buffer */ 2825 /* trace pipe does not show start of buffer */
2827 cpumask_setall(iter->started); 2826 cpumask_setall(iter->started);
2828 2827
2828 if (trace_flags & TRACE_ITER_LATENCY_FMT)
2829 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2830
2829 iter->cpu_file = cpu_file; 2831 iter->cpu_file = cpu_file;
2830 iter->tr = &global_trace; 2832 iter->tr = &global_trace;
2831 mutex_init(&iter->mutex); 2833 mutex_init(&iter->mutex);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c81f9c21426..aa08be69a1b6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod)
1050 struct ftrace_event_call *call, *p; 1050 struct ftrace_event_call *call, *p;
1051 bool found = false; 1051 bool found = false;
1052 1052
1053 down_write(&trace_event_mutex);
1053 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1054 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1054 if (call->mod == mod) { 1055 if (call->mod == mod) {
1055 found = true; 1056 found = true;
1056 ftrace_event_enable_disable(call, 0); 1057 ftrace_event_enable_disable(call, 0);
1057 if (call->event) 1058 if (call->event)
1058 unregister_ftrace_event(call->event); 1059 __unregister_ftrace_event(call->event);
1059 debugfs_remove_recursive(call->dir); 1060 debugfs_remove_recursive(call->dir);
1060 list_del(&call->list); 1061 list_del(&call->list);
1061 trace_destroy_fields(call); 1062 trace_destroy_fields(call);
@@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod)
1079 */ 1080 */
1080 if (found) 1081 if (found)
1081 tracing_reset_current_online_cpus(); 1082 tracing_reset_current_online_cpus();
1083 up_write(&trace_event_mutex);
1082} 1084}
1083 1085
1084static int trace_module_notify(struct notifier_block *self, 1086static int trace_module_notify(struct notifier_block *self,
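
trace_module_remove_events() now holds trace_event_mutex for write across the whole walk, so it has to call the new __unregister_ftrace_event() (defined in trace_output.c below), which assumes the lock is already held; calling the public unregister_ftrace_event() from inside the walk would try to take the rwsem again and deadlock. A user-space sketch of that shape, with pthread locks and made-up names:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t event_lock = PTHREAD_RWLOCK_INITIALIZER;

/* caller must already hold event_lock for write */
static void __unregister_event(const char *name)
{
	printf("removing event %s\n", name);    /* hlist_del() + list_del() in the kernel */
}

void unregister_event(const char *name)
{
	pthread_rwlock_wrlock(&event_lock);
	__unregister_event(name);
	pthread_rwlock_unlock(&event_lock);
}

void remove_module_events(const char *mod)
{
	pthread_rwlock_wrlock(&event_lock);
	/*
	 * While walking the module's events we must use the __ variant;
	 * unregister_event() would re-take the write lock we already hold.
	 */
	__unregister_event(mod);
	pthread_rwlock_unlock(&event_lock);
}

int main(void)
{
	remove_module_events("example_events");
	return 0;
}
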
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a7430b16d243..db6e54bdb596 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -478,12 +478,12 @@ enum {
478 478
479static int is_string_field(const char *type) 479static int is_string_field(const char *type)
480{ 480{
481 if (strstr(type, "__data_loc") && strstr(type, "char"))
482 return FILTER_DYN_STRING;
483
481 if (strchr(type, '[') && strstr(type, "char")) 484 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 485 return FILTER_STATIC_STRING;
483 486
484 if (!strcmp(type, "__str_loc"))
485 return FILTER_DYN_STRING;
486
487 return 0; 487 return 0;
488} 488}
489 489
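
The filter code now treats any field whose type string contains both "__data_loc" and "char" as a dynamic string, while "char foo[N]" style arrays remain static strings. A user-space sketch of that classification with a few example type strings; the enum values are local to the sketch.

#include <stdio.h>
#include <string.h>

enum field_kind { FILTER_OTHER, FILTER_STATIC_STRING, FILTER_DYN_STRING };

static enum field_kind classify(const char *type)
{
	if (strstr(type, "__data_loc") && strstr(type, "char"))
		return FILTER_DYN_STRING;       /* offset/length pair into the event */
	if (strchr(type, '[') && strstr(type, "char"))
		return FILTER_STATIC_STRING;    /* fixed-size char array */
	return FILTER_OTHER;
}

static const char *names[] = { "other", "static string", "dynamic string" };

int main(void)
{
	const char *samples[] = {
		"__data_loc char[] name",
		"char comm[16]",
		"unsigned long ip",
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%-24s -> %s\n", samples[i], names[classify(samples[i])]);
	return 0;
}
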
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 10f6ad7d85f6..8b592418d8b2 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
65 if (!current->ret_stack) 65 if (!current->ret_stack)
66 return -EBUSY; 66 return -EBUSY;
67 67
68 /*
69 * We must make sure the ret_stack is tested before we read
70 * anything else.
71 */
72 smp_rmb();
73
68 /* The return trace stack is full */ 74 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { 75 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun); 76 atomic_inc(&current->trace_overrun);
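
This smp_rmb() pairs with the smp_wmb() calls added to ftrace_graph_init_task() and alloc_retstack_tasklist() above: curr_ret_stack and the rest of the state are written first, the ret_stack pointer is published last, and any path that sees a non-NULL ret_stack is therefore guaranteed to see the initialised fields. A user-space sketch of the same publish/consume pattern using C11 release/acquire in place of the explicit barriers; the names are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ret_entry { unsigned long ret; unsigned long func; };

struct task {
	int curr_ret_stack;
	_Atomic(struct ret_entry *) ret_stack;
};

/* writer: initialise everything, then publish the pointer last */
static void graph_init_task(struct task *t)
{
	struct ret_entry *stack = calloc(50, sizeof(*stack));

	if (!stack)
		return;
	t->curr_ret_stack = -1;                         /* all other state first... */
	atomic_store_explicit(&t->ret_stack, stack,     /* ...then publish */
			      memory_order_release);
}

/* reader: test the pointer first; only then touch the other fields */
static int push_return(struct task *t)
{
	if (!atomic_load_explicit(&t->ret_stack, memory_order_acquire))
		return -1;                              /* -EBUSY in the kernel */
	return ++t->curr_ret_stack;
}

int main(void)
{
	struct task t;

	atomic_init(&t.ret_stack, NULL);
	graph_init_task(&t);
	printf("depth after push: %d\n", push_return(&t));
	return 0;
}
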
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c12d95db2f56..7938f3ae93e3 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,9 +14,10 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17static DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); 19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
20 21
21static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
22 23
@@ -99,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
99} 100}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 101EXPORT_SYMBOL_GPL(trace_seq_printf);
101 102
103/**
104 * trace_seq_vprintf - sequence printing of trace information
105 * @s: trace sequence descriptor
106 * @fmt: printf format string
107 *
108 * The tracer may use either sequence operations or its own
109 * copy to user routines. To simplify formating of a trace
110 * trace_seq_printf is used to store strings into a special
111 * buffer (@s). Then the output may be either used by
112 * the sequencer or pulled into another buffer.
113 */
114int
115trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
116{
117 int len = (PAGE_SIZE - 1) - s->len;
118 int ret;
119
120 if (!len)
121 return 0;
122
123 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
124
125 /* If we can't write it all, don't bother writing anything */
126 if (ret >= len)
127 return 0;
128
129 s->len += ret;
130
131 return len;
132}
133EXPORT_SYMBOL_GPL(trace_seq_vprintf);
134
102int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 135int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
103{ 136{
104 int len = (PAGE_SIZE - 1) - s->len; 137 int len = (PAGE_SIZE - 1) - s->len;
@@ -222,10 +255,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
222{ 255{
223 unsigned long mask; 256 unsigned long mask;
224 const char *str; 257 const char *str;
258 const char *ret = p->buffer + p->len;
225 int i; 259 int i;
226 260
227 trace_seq_init(p);
228
229 for (i = 0; flag_array[i].name && flags; i++) { 261 for (i = 0; flag_array[i].name && flags; i++) {
230 262
231 mask = flag_array[i].mask; 263 mask = flag_array[i].mask;
@@ -248,16 +280,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
248 280
249 trace_seq_putc(p, 0); 281 trace_seq_putc(p, 0);
250 282
251 return p->buffer; 283 return ret;
252} 284}
285EXPORT_SYMBOL(ftrace_print_flags_seq);
253 286
254const char * 287const char *
255ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, 288ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
256 const struct trace_print_flags *symbol_array) 289 const struct trace_print_flags *symbol_array)
257{ 290{
258 int i; 291 int i;
259 292 const char *ret = p->buffer + p->len;
260 trace_seq_init(p);
261 293
262 for (i = 0; symbol_array[i].name; i++) { 294 for (i = 0; symbol_array[i].name; i++) {
263 295
@@ -273,8 +305,9 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
273 305
274 trace_seq_putc(p, 0); 306 trace_seq_putc(p, 0);
275 307
276 return p->buffer; 308 return ret;
277} 309}
310EXPORT_SYMBOL(ftrace_print_symbols_seq);
278 311
279#ifdef CONFIG_KRETPROBES 312#ifdef CONFIG_KRETPROBES
280static inline const char *kretprobed(const char *name) 313static inline const char *kretprobed(const char *name)
@@ -386,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
386 419
387 if (ip == ULONG_MAX || !ret) 420 if (ip == ULONG_MAX || !ret)
388 break; 421 break;
389 if (i && ret) 422 if (ret)
390 ret = trace_seq_puts(s, " <- "); 423 ret = trace_seq_puts(s, " => ");
391 if (!ip) { 424 if (!ip) {
392 if (ret) 425 if (ret)
393 ret = trace_seq_puts(s, "??"); 426 ret = trace_seq_puts(s, "??");
427 if (ret)
428 ret = trace_seq_puts(s, "\n");
394 continue; 429 continue;
395 } 430 }
396 if (!ret) 431 if (!ret)
397 break; 432 break;
398 if (ret) 433 if (ret)
399 ret = seq_print_user_ip(s, mm, ip, sym_flags); 434 ret = seq_print_user_ip(s, mm, ip, sym_flags);
435 ret = trace_seq_puts(s, "\n");
400 } 436 }
401 437
402 if (mm) 438 if (mm)
@@ -666,6 +702,16 @@ int register_ftrace_event(struct trace_event *event)
666} 702}
667EXPORT_SYMBOL_GPL(register_ftrace_event); 703EXPORT_SYMBOL_GPL(register_ftrace_event);
668 704
705/*
706 * Used by module code with the trace_event_mutex held for write.
707 */
708int __unregister_ftrace_event(struct trace_event *event)
709{
710 hlist_del(&event->node);
711 list_del(&event->list);
712 return 0;
713}
714
669/** 715/**
670 * unregister_ftrace_event - remove a no longer used event 716 * unregister_ftrace_event - remove a no longer used event
671 * @event: the event to remove 717 * @event: the event to remove
@@ -673,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
673int unregister_ftrace_event(struct trace_event *event) 719int unregister_ftrace_event(struct trace_event *event)
674{ 720{
675 down_write(&trace_event_mutex); 721 down_write(&trace_event_mutex);
676 hlist_del(&event->node); 722 __unregister_ftrace_event(event);
677 list_del(&event->list);
678 up_write(&trace_event_mutex); 723 up_write(&trace_event_mutex);
679 724
680 return 0; 725 return 0;
@@ -972,16 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
972 1017
973 trace_assign_type(field, iter->ent); 1018 trace_assign_type(field, iter->ent);
974 1019
1020 if (!trace_seq_puts(s, "<stack trace>\n"))
1021 goto partial;
975 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1022 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
976 if (!field->caller[i]) 1023 if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
977 break; 1024 break;
978 if (i) { 1025 if (!trace_seq_puts(s, " => "))
979 if (!trace_seq_puts(s, " <= ")) 1026 goto partial;
980 goto partial;
981 1027
982 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1028 if (!seq_print_ip_sym(s, field->caller[i], flags))
983 goto partial; 1029 goto partial;
984 }
985 if (!trace_seq_puts(s, "\n")) 1030 if (!trace_seq_puts(s, "\n"))
986 goto partial; 1031 goto partial;
987 } 1032 }
@@ -1009,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1009 1054
1010 trace_assign_type(field, iter->ent); 1055 trace_assign_type(field, iter->ent);
1011 1056
1012 if (!seq_print_userip_objs(field, s, flags)) 1057 if (!trace_seq_puts(s, "<user stack trace>\n"))
1013 goto partial; 1058 goto partial;
1014 1059
1015 if (!trace_seq_putc(s, '\n')) 1060 if (!seq_print_userip_objs(field, s, flags))
1016 goto partial; 1061 goto partial;
1017 1062
1018 return TRACE_TYPE_HANDLED; 1063 return TRACE_TYPE_HANDLED;
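
trace_seq_vprintf() follows the same all-or-nothing policy as trace_seq_printf(): if the formatted output would not fit in the remaining page-sized buffer, nothing is appended at all, and on success the remaining room (not the byte count) is returned. A user-space sketch of that behaviour; the buffer size and struct are local stand-ins for struct trace_seq.

#include <stdarg.h>
#include <stdio.h>

#define SEQ_SIZE 4096                   /* PAGE_SIZE stand-in */

struct seq { char buffer[SEQ_SIZE]; int len; };

static int seq_vprintf(struct seq *s, const char *fmt, va_list args)
{
	int room = (SEQ_SIZE - 1) - s->len;
	int ret;

	if (!room)
		return 0;
	ret = vsnprintf(s->buffer + s->len, room, fmt, args);
	if (ret >= room)                /* would be truncated: keep nothing */
		return 0;
	s->len += ret;
	return room;                    /* mirrors the kernel's return of the room left */
}

static int seq_printf(struct seq *s, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = seq_vprintf(s, fmt, args);
	va_end(args);
	return ret;
}

int main(void)
{
	struct seq s = { .len = 0 };

	seq_printf(&s, "pid=%d comm=%s\n", 1, "init");
	fputs(s.buffer, stdout);
	return 0;
}
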
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ac240e76eb01..d38bec4a9c30 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type);
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29 29
30/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event);
32extern struct rw_semaphore trace_event_mutex;
33
30#define MAX_MEMHEX_BYTES 8 34#define MAX_MEMHEX_BYTES 8
31#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 35#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
32 36
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1796f00524e1..2d7aebd71dbd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
265 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
266 " (%d entries)\n" 266 " (%d entries)\n"
267 " ----- ---- --------\n", 267 " ----- ---- --------\n",
268 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries - 1);
269 269
270 if (!stack_tracer_enabled && !max_stack_size) 270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m); 271 print_disabled(m);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index e04b76cc238a..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 205
206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
207 HRTIMER_MODE_REL_PINNED);
207} 208}
208 209
209static void start_stack_timers(void) 210static void start_stack_timers(void)
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }
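
The user.c change turns the final free of a user_struct into delayed work (scheduled about a second out) and lets uid_hash_find() resurrect an entry whose refcount comes back from 0 to 1 by cancelling that pending work; the kernel does the lookup-side transition under uidhash_lock. A simplified, lock-free user-space sketch of just the refcount side of that idea, with the delayed work reduced to a flag and made-up names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct user_obj {
	atomic_int count;
	bool cleanup_pending;   /* stands in for the queued delayed work */
};

/* lookup path: a 0 -> 1 transition means a delayed cleanup is queued
 * and must be cancelled (cancel_delayed_work() in the kernel) */
static void get_user_obj(struct user_obj *u)
{
	if (atomic_fetch_add(&u->count, 1) == 0)
		u->cleanup_pending = false;
}

/* release path: on the last put, queue the cleanup instead of freeing
 * right away (schedule_delayed_work(&up->work, 1s) in the kernel) */
static void put_user_obj(struct user_obj *u)
{
	if (atomic_fetch_sub(&u->count, 1) == 1)
		u->cleanup_pending = true;
}

int main(void)
{
	struct user_obj u;

	atomic_init(&u.count, 1);
	u.cleanup_pending = false;

	put_user_obj(&u);       /* refcount hits 0: cleanup queued      */
	get_user_obj(&u);       /* looked up again in time: resurrected */
	printf("cleanup pending: %s\n", u.cleanup_pending ? "yes" : "no");
	return 0;
}
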
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 157 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 158 spin_unlock_irqrestore(&q->lock, flags);
159} 159}
160EXPORT_SYMBOL(abort_exclusive_wait); 160EXPORT_SYMBOL(abort_exclusive_wait);