aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile21
-rw-r--r--kernel/acct.c46
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/audit.c7
-rw-r--r--kernel/auditsc.c22
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/cgroup.c467
-rw-r--r--kernel/cgroup_freezer.c88
-rw-r--r--kernel/compat.c1
-rw-r--r--kernel/cpu.c82
-rw-r--r--kernel/cpu_pm.c233
-rw-r--r--kernel/cpuset.c133
-rw-r--r--kernel/crash_dump.c13
-rw-r--r--kernel/cred.c20
-rw-r--r--kernel/debug/gdbstub.c12
-rw-r--r--kernel/debug/kdb/kdb_debugger.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/dma.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c191
-rw-r--r--kernel/events/core.c419
-rw-r--r--kernel/events/internal.h42
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/exit.c45
-rw-r--r--kernel/fork.c48
-rw-r--r--kernel/freezer.c205
-rw-r--r--kernel/futex.c40
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hrtimer.c8
-rw-r--r--kernel/hung_task.c16
-rw-r--r--kernel/irq/chip.c64
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/internals.h21
-rw-r--r--kernel/irq/irqdesc.c34
-rw-r--r--kernel/irq/irqdomain.c27
-rw-r--r--kernel/irq/manage.c225
-rw-r--r--kernel/irq/pm.c48
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq_work.c95
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c87
-rw-r--r--kernel/kexec.c70
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/kmod.c31
-rw-r--r--kernel/kprobes.c38
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c29
-rw-r--r--kernel/latencytop.c16
-rw-r--r--kernel/lockdep.c331
-rw-r--r--kernel/lockdep_proc.c2
-rw-r--r--kernel/module.c261
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c45
-rw-r--r--kernel/params.c61
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/posix-cpu-timers.c146
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile4
-rw-r--r--kernel/power/console.c4
-rw-r--r--kernel/power/hibernate.c169
-rw-r--r--kernel/power/main.c116
-rw-r--r--kernel/power/power.h6
-rw-r--r--kernel/power/process.c105
-rw-r--r--kernel/power/qos.c (renamed from kernel/pm_qos_params.c)278
-rw-r--r--kernel/power/snapshot.c24
-rw-r--r--kernel/power/suspend.c21
-rw-r--r--kernel/power/swap.c819
-rw-r--r--kernel/power/user.c185
-rw-r--r--kernel/printk.c76
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/range.c2
-rw-r--r--kernel/rcu.h92
-rw-r--r--kernel/rcupdate.c40
-rw-r--r--kernel/rcutiny.c265
-rw-r--r--kernel/rcutiny_plugin.h164
-rw-r--r--kernel/rcutorture.c302
-rw-r--r--kernel/rcutree.c572
-rw-r--r--kernel/rcutree.h43
-rw-r--r--kernel/rcutree_plugin.h419
-rw-r--r--kernel/rcutree_trace.c25
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex-debug.c80
-rw-r--r--kernel/rtmutex-tester.c39
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/rwsem.c2
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)2
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2541
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)93
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)7
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1862
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)34
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)320
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)121
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)6
-rw-r--r--kernel/semaphore.c30
-rw-r--r--kernel/signal.c108
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/spinlock.c2
-rw-r--r--kernel/srcu.c2
-rw-r--r--kernel/stacktrace.c2
-rw-r--r--kernel/stop_machine.c24
-rw-r--r--kernel/sys.c133
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c36
-rw-r--r--kernel/sysctl_binary.c4
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c268
-rw-r--r--kernel/time/clockevents.c130
-rw-r--r--kernel/time/clocksource.c149
-rw-r--r--kernel/time/posix-clock.c1
-rw-r--r--kernel/time/tick-broadcast.c6
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-oneshot.c77
-rw-r--r--kernel/time/tick-sched.c166
-rw-r--r--kernel/time/timekeeping.c94
-rw-r--r--kernel/time/timer_stats.c6
-rw-r--r--kernel/timer.c66
-rw-r--r--kernel/trace/Makefile5
-rw-r--r--kernel/trace/blktrace.c3
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/ring_buffer.c122
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c299
-rw-r--r--kernel/trace/trace.h20
-rw-r--r--kernel/trace/trace_clock.c12
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_filter.c834
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_irqsoff.c23
-rw-r--r--kernel/trace/trace_kprobe.c58
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_printk.c19
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/trace/trace_syscalls.c1
-rw-r--r--kernel/tracepoint.c169
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/user-return-notifier.c2
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/utsname_sysctl.c25
-rw-r--r--kernel/wait.c6
-rw-r--r--kernel/watchdog.c11
-rw-r--r--kernel/workqueue.c34
166 files changed, 11304 insertions, 5942 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,8 +99,8 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_CPU_PM) += cpu_pm.o
104 104
105obj-$(CONFIG_PERF_EVENTS) += events/ 105obj-$(CONFIG_PERF_EVENTS) += events/
106 106
@@ -109,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
114# needed for x86 only. Why this used to be enabled for all architectures is beyond
115# me. I suspect most platforms don't need this, but until we know that for sure
116# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
117# to get a correct value for the wait-channel (WCHAN in ps). --davidm
118CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
119endif
120
121$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
122 113
123# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 593 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 594 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 595 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 596 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 597 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 598 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 599 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 600 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2e..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
@@ -120,7 +118,7 @@ static void async_run_entry_fn(struct work_struct *work)
120 struct async_entry *entry = 118 struct async_entry *entry =
121 container_of(work, struct async_entry, work); 119 container_of(work, struct async_entry, work);
122 unsigned long flags; 120 unsigned long flags;
123 ktime_t calltime, delta, rettime; 121 ktime_t uninitialized_var(calltime), delta, rettime;
124 122
125 /* 1) move self to the running queue */ 123 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 124 spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +267,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
269void async_synchronize_cookie_domain(async_cookie_t cookie, 267void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running) 268 struct list_head *running)
271{ 269{
272 ktime_t starttime, delta, endtime; 270 ktime_t uninitialized_var(starttime), delta, endtime;
273 271
274 if (initcall_debug && system_state == SYSTEM_BOOTING) { 272 if (initcall_debug && system_state == SYSTEM_BOOTING) {
275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 273 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index f3ba55fa0b70..57e3f5107937 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/export.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee5..e7fe2b0d29b3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/export.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
@@ -210,12 +210,12 @@ struct audit_context {
210 struct { 210 struct {
211 uid_t uid; 211 uid_t uid;
212 gid_t gid; 212 gid_t gid;
213 mode_t mode; 213 umode_t mode;
214 u32 osid; 214 u32 osid;
215 int has_perm; 215 int has_perm;
216 uid_t perm_uid; 216 uid_t perm_uid;
217 gid_t perm_gid; 217 gid_t perm_gid;
218 mode_t perm_mode; 218 umode_t perm_mode;
219 unsigned long qbytes; 219 unsigned long qbytes;
220 } ipc; 220 } ipc;
221 struct { 221 struct {
@@ -234,7 +234,7 @@ struct audit_context {
234 } mq_sendrecv; 234 } mq_sendrecv;
235 struct { 235 struct {
236 int oflag; 236 int oflag;
237 mode_t mode; 237 umode_t mode;
238 struct mq_attr attr; 238 struct mq_attr attr;
239 } mq_open; 239 } mq_open;
240 struct { 240 struct {
@@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
308static int audit_match_filetype(struct audit_context *ctx, int which) 308static int audit_match_filetype(struct audit_context *ctx, int which)
309{ 309{
310 unsigned index = which & ~S_IFMT; 310 unsigned index = which & ~S_IFMT;
311 mode_t mode = which & S_IFMT; 311 umode_t mode = which & S_IFMT;
312 312
313 if (unlikely(!ctx)) 313 if (unlikely(!ctx))
314 return 0; 314 return 0;
@@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1249 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1250 u32 osid = context->ipc.osid;
1251 1251
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1253 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1254 if (osid) {
1255 char *ctx = NULL; 1255 char *ctx = NULL;
@@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1267 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1268 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1269 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1270 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1271 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1272 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1273 context->ipc.perm_gid,
@@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1278 break; }
1279 case AUDIT_MQ_OPEN: { 1279 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1280 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1281 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1282 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1283 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1284 context->mq_open.attr.mq_flags,
@@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1502 1502
1503 if (n->ino != (unsigned long)-1) { 1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu" 1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o" 1505 " dev=%02x:%02x mode=%#ho"
1506 " ouid=%u ogid=%u rdev=%02x:%02x", 1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino, 1507 n->ino,
1508 MAJOR(n->dev), 1508 MAJOR(n->dev),
@@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2160 * @attr: queue attributes
2161 * 2161 *
2162 */ 2162 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2163void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2164{
2165 struct audit_context *context = current->audit_context; 2165 struct audit_context *context = current->audit_context;
2166 2166
@@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2260 *
2261 * Called only after audit_ipc_obj(). 2261 * Called only after audit_ipc_obj().
2262 */ 2262 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2264{
2265 struct audit_context *context = current->audit_context; 2265 struct audit_context *context = current->audit_context;
2266 2266
diff --git a/kernel/capability.c b/kernel/capability.c
index 74fb3b603045..0fcf1c14a297 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -265,7 +282,7 @@ list_for_each_entry(_root, &roots, root_list)
265/* the list of cgroups eligible for automatic release. Protected by 282/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */ 283 * release_list_lock */
267static LIST_HEAD(release_list); 284static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock); 285static DEFINE_RAW_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work); 286static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent); 287static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp); 288static void check_for_release(struct cgroup *cgrp);
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2027,7 +2103,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2027 goto out_free_group_list; 2103 goto out_free_group_list;
2028 2104
2029 /* prevent changes to the threadgroup list while we take a snapshot. */ 2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock(); 2106 read_lock(&tasklist_lock);
2031 if (!thread_group_leader(leader)) { 2107 if (!thread_group_leader(leader)) {
2032 /* 2108 /*
2033 * a race with de_thread from another thread's exec() may strip 2109 * a race with de_thread from another thread's exec() may strip
@@ -2036,53 +2112,57 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2036 * throw this task away and try again (from cgroup_procs_write); 2112 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking". 2113 * this is "double-double-toil-and-trouble-check locking".
2038 */ 2114 */
2039 rcu_read_unlock(); 2115 read_unlock(&tasklist_lock);
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2060 rcu_read_unlock(); 2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2061 2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,69 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 if (tsk->flags & PF_EXITING) {
2102 /* ignore this task if it's going away */
2103 task_unlock(tsk);
2104 continue;
2105 }
2106 oldcg = tsk->cgroups;
2107 get_css_set(oldcg);
2108 task_unlock(tsk);
2109 /* see if the new one for us is already in the list? */
2110 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2111 /* was already there, nothing to do. */
2112 put_css_set(oldcg);
2113 } else {
2114 /* we don't already have it. get new one. */
2115 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2116 put_css_set(oldcg);
2117 if (retval) 2181 if (retval)
2118 goto out_list_teardown; 2182 goto out_list_teardown;
2119 } 2183 }
2120 } 2184 }
2121 2185
2122 /* 2186 /*
2123 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2124 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2125 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2126 * the commit point.
2127 */ 2190 */
2128 for_each_subsys(root, ss) {
2129 if (ss->pre_attach)
2130 ss->pre_attach(cgrp);
2131 }
2132 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2133 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2134 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2135 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2136 if (cgrp == oldcgrp)
2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH);
2146 } 2195 }
2147 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2148 2197
2149 /* 2198 /*
2150 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2151 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2152 * being moved, this call will need to be reworked to communicate that.
2153 */ 2200 */
2154 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2155 if (ss->attach) 2202 if (ss->attach)
2156 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2157 } 2204 }
2158 2205
2159 /* 2206 /*
@@ -2173,20 +2220,12 @@ out_cancel_attach:
2173 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2174 if (retval) { 2221 if (retval) {
2175 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2176 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2177 if (cancel_failed_ss && ss->cancel_attach)
2178 ss->cancel_attach(ss, cgrp, leader);
2179 break; 2224 break;
2180 }
2181 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2182 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2183 } 2227 }
2184 } 2228 }
2185 /* clean up the array of referenced threads in the group. */
2186 for (i = 0; i < group_size; i++) {
2187 tsk = flex_array_get_ptr(group, i);
2188 put_task_struct(tsk);
2189 }
2190out_free_group_list: 2229out_free_group_list:
2191 flex_array_free(group); 2230 flex_array_free(group);
2192 return retval; 2231 return retval;
@@ -2194,8 +2233,8 @@ out_free_group_list:
2194 2233
2195/* 2234/*
2196 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2197 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2198 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2199 */ 2238 */
2200static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2201{ 2240{
@@ -2222,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2222 * detect it later. 2261 * detect it later.
2223 */ 2262 */
2224 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2225 } else if (tsk->flags & PF_EXITING) {
2226 /* optimization for the single-task-only case */
2227 rcu_read_unlock();
2228 cgroup_unlock();
2229 return -ESRCH;
2230 } 2264 }
2231
2232 /* 2265 /*
2233 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2234 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2251,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2251 get_task_struct(tsk); 2284 get_task_struct(tsk);
2252 } 2285 }
2253 2286
2254 if (threadgroup) { 2287 threadgroup_lock(tsk);
2255 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2256 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2257 threadgroup_fork_write_unlock(tsk); 2291 else
2258 } else {
2259 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2260 } 2293
2294 threadgroup_unlock(tsk);
2295
2261 put_task_struct(tsk); 2296 put_task_struct(tsk);
2262 cgroup_unlock(); 2297 cgroup_unlock();
2263 return ret; 2298 return ret;
@@ -2308,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2308 return -EINVAL; 2343 return -EINVAL;
2309 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2310 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2311 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2312 cgroup_unlock(); 2349 cgroup_unlock();
2313 return 0; 2350 return 0;
2314} 2351}
@@ -2587,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2587 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2588} 2625}
2589 2626
2590static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2591 struct super_block *sb) 2628 struct super_block *sb)
2592{ 2629{
2593 struct inode *inode; 2630 struct inode *inode;
@@ -2628,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2628 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2629 */ 2666 */
2630static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2631 mode_t mode) 2668 umode_t mode)
2632{ 2669{
2633 struct dentry *parent; 2670 struct dentry *parent;
2634 int error = 0; 2671 int error = 0;
@@ -2655,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2655 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2656 * returns S_IWUSR if it has only a write handler 2693 * returns S_IWUSR if it has only a write handler
2657 */ 2694 */
2658static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2659{ 2696{
2660 mode_t mode = 0; 2697 umode_t mode = 0;
2661 2698
2662 if (cft->mode) 2699 if (cft->mode)
2663 return cft->mode; 2700 return cft->mode;
@@ -2680,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2680 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2681 struct dentry *dentry; 2718 struct dentry *dentry;
2682 int error; 2719 int error;
2683 mode_t mode; 2720 umode_t mode;
2684 2721
2685 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2686 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2791,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2791} 2828}
2792 2829
2793void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2794{ 2832{
2795 /* 2833 /*
2796 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2830,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2830} 2868}
2831 2869
2832void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2833{ 2872{
2834 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2835} 2874}
@@ -3754,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3754 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3755 */ 3794 */
3756static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3757 mode_t mode) 3796 umode_t mode)
3758{ 3797{
3759 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3760 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3848,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3848 return err; 3887 return err;
3849} 3888}
3850 3889
3851static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3852{ 3891{
3853 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3854 3893
@@ -4014,11 +4053,11 @@ again:
4014 finish_wait(&cgroup_rmdir_waitq, &wait); 4053 finish_wait(&cgroup_rmdir_waitq, &wait);
4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4054 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4016 4055
4017 spin_lock(&release_list_lock); 4056 raw_spin_lock(&release_list_lock);
4018 set_bit(CGRP_REMOVED, &cgrp->flags); 4057 set_bit(CGRP_REMOVED, &cgrp->flags);
4019 if (!list_empty(&cgrp->release_list)) 4058 if (!list_empty(&cgrp->release_list))
4020 list_del_init(&cgrp->release_list); 4059 list_del_init(&cgrp->release_list);
4021 spin_unlock(&release_list_lock); 4060 raw_spin_unlock(&release_list_lock);
4022 4061
4023 cgroup_lock_hierarchy(cgrp->root); 4062 cgroup_lock_hierarchy(cgrp->root);
4024 /* delete this cgroup from parent->children */ 4063 /* delete this cgroup from parent->children */
@@ -4493,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4493 * 4532 *
4494 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4495 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4496 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4497 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4498 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4499 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
 4541 * Besides pointer validity, we also need to handle the css_set
 4542 * inheritance between threadgroup_change_begin() and
 4543 * threadgroup_change_end(), so that a process-wide migration
 4544 * performed by cgroup_attach_proc() cannot miss a thread simply
 4545 * because it is too early or too late in the fork stage.
4500 * 4546 *
4501 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4502 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4503 */ 4549 */
4504void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4505{ 4551{
4506 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4507 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4508 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4509 task_unlock(current);
4510 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4511} 4561}
4512 4562
@@ -4548,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4548{ 4598{
4549 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4550 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4551 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4552 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
 4609 * against cgroup_exit() setting child->cgroups to
4610 * init_css_set.
4611 */
4553 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4554 task_unlock(child); 4613 }
4555 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4556 } 4615 }
4557} 4616}
@@ -4671,13 +4730,13 @@ static void check_for_release(struct cgroup *cgrp)
4671 * already queued for a userspace notification, queue 4730 * already queued for a userspace notification, queue
4672 * it now */ 4731 * it now */
4673 int need_schedule_work = 0; 4732 int need_schedule_work = 0;
4674 spin_lock(&release_list_lock); 4733 raw_spin_lock(&release_list_lock);
4675 if (!cgroup_is_removed(cgrp) && 4734 if (!cgroup_is_removed(cgrp) &&
4676 list_empty(&cgrp->release_list)) { 4735 list_empty(&cgrp->release_list)) {
4677 list_add(&cgrp->release_list, &release_list); 4736 list_add(&cgrp->release_list, &release_list);
4678 need_schedule_work = 1; 4737 need_schedule_work = 1;
4679 } 4738 }
4680 spin_unlock(&release_list_lock); 4739 raw_spin_unlock(&release_list_lock);
4681 if (need_schedule_work) 4740 if (need_schedule_work)
4682 schedule_work(&release_agent_work); 4741 schedule_work(&release_agent_work);
4683 } 4742 }
@@ -4729,7 +4788,7 @@ static void cgroup_release_agent(struct work_struct *work)
4729{ 4788{
4730 BUG_ON(work != &release_agent_work); 4789 BUG_ON(work != &release_agent_work);
4731 mutex_lock(&cgroup_mutex); 4790 mutex_lock(&cgroup_mutex);
4732 spin_lock(&release_list_lock); 4791 raw_spin_lock(&release_list_lock);
4733 while (!list_empty(&release_list)) { 4792 while (!list_empty(&release_list)) {
4734 char *argv[3], *envp[3]; 4793 char *argv[3], *envp[3];
4735 int i; 4794 int i;
@@ -4738,7 +4797,7 @@ static void cgroup_release_agent(struct work_struct *work)
4738 struct cgroup, 4797 struct cgroup,
4739 release_list); 4798 release_list);
4740 list_del_init(&cgrp->release_list); 4799 list_del_init(&cgrp->release_list);
4741 spin_unlock(&release_list_lock); 4800 raw_spin_unlock(&release_list_lock);
4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4801 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4743 if (!pathbuf) 4802 if (!pathbuf)
4744 goto continue_free; 4803 goto continue_free;
@@ -4768,9 +4827,9 @@ static void cgroup_release_agent(struct work_struct *work)
4768 continue_free: 4827 continue_free:
4769 kfree(pathbuf); 4828 kfree(pathbuf);
4770 kfree(agentbuf); 4829 kfree(agentbuf);
4771 spin_lock(&release_list_lock); 4830 raw_spin_lock(&release_list_lock);
4772 } 4831 }
4773 spin_unlock(&release_list_lock); 4832 raw_spin_unlock(&release_list_lock);
4774 mutex_unlock(&cgroup_mutex); 4833 mutex_unlock(&cgroup_mutex);
4775} 4834}
4776 4835
@@ -4880,9 +4939,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4880 4939
4881 rcu_assign_pointer(id->css, NULL); 4940 rcu_assign_pointer(id->css, NULL);
4882 rcu_assign_pointer(css->id, NULL); 4941 rcu_assign_pointer(css->id, NULL);
4883 spin_lock(&ss->id_lock); 4942 write_lock(&ss->id_lock);
4884 idr_remove(&ss->idr, id->id); 4943 idr_remove(&ss->idr, id->id);
4885 spin_unlock(&ss->id_lock); 4944 write_unlock(&ss->id_lock);
4886 kfree_rcu(id, rcu_head); 4945 kfree_rcu(id, rcu_head);
4887} 4946}
4888EXPORT_SYMBOL_GPL(free_css_id); 4947EXPORT_SYMBOL_GPL(free_css_id);
@@ -4908,10 +4967,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4908 error = -ENOMEM; 4967 error = -ENOMEM;
4909 goto err_out; 4968 goto err_out;
4910 } 4969 }
4911 spin_lock(&ss->id_lock); 4970 write_lock(&ss->id_lock);
4912 /* Don't use 0. allocates an ID of 1-65535 */ 4971 /* Don't use 0. allocates an ID of 1-65535 */
4913 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4914 spin_unlock(&ss->id_lock); 4973 write_unlock(&ss->id_lock);
4915 4974
4916 /* Returns error when there are no free spaces for new ID.*/ 4975 /* Returns error when there are no free spaces for new ID.*/
4917 if (error) { 4976 if (error) {
@@ -4926,9 +4985,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4926 return newid; 4985 return newid;
4927remove_idr: 4986remove_idr:
4928 error = -ENOSPC; 4987 error = -ENOSPC;
4929 spin_lock(&ss->id_lock); 4988 write_lock(&ss->id_lock);
4930 idr_remove(&ss->idr, myid); 4989 idr_remove(&ss->idr, myid);
4931 spin_unlock(&ss->id_lock); 4990 write_unlock(&ss->id_lock);
4932err_out: 4991err_out:
4933 kfree(newid); 4992 kfree(newid);
4934 return ERR_PTR(error); 4993 return ERR_PTR(error);
@@ -4940,7 +4999,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4940{ 4999{
4941 struct css_id *newid; 5000 struct css_id *newid;
4942 5001
4943 spin_lock_init(&ss->id_lock); 5002 rwlock_init(&ss->id_lock);
4944 idr_init(&ss->idr); 5003 idr_init(&ss->idr);
4945 5004
4946 newid = get_new_cssid(ss, 0); 5005 newid = get_new_cssid(ss, 0);
@@ -5035,9 +5094,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
5035 * scan next entry from bitmap(tree), tmpid is updated after 5094 * scan next entry from bitmap(tree), tmpid is updated after
5036 * idr_get_next(). 5095 * idr_get_next().
5037 */ 5096 */
5038 spin_lock(&ss->id_lock); 5097 read_lock(&ss->id_lock);
5039 tmp = idr_get_next(&ss->idr, &tmpid); 5098 tmp = idr_get_next(&ss->idr, &tmpid);
5040 spin_unlock(&ss->id_lock); 5099 read_unlock(&ss->id_lock);
5041 5100
5042 if (!tmp) 5101 if (!tmp)
5043 break; 5102 break;
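
The cgroup.c changes above fold the old per-task callbacks (can_attach_task, pre_attach, attach_task) into can_attach/attach callbacks that receive a struct cgroup_taskset describing every task being moved, and attach_task_by_pid() now wraps the whole migration in threadgroup_lock(). A minimal sketch of a subsystem written against this interface follows; the "example" subsystem and its helpers (example_allow, example_account_move) are invented purely for illustration.

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* validate every task in the set up front; nothing has moved yet */
	cgroup_taskset_for_each(task, cgrp, tset)
		if (!example_allow(task))
			return -EPERM;
	return 0;
}

static void example_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	/* for a threadgroup move the first task in the set is the leader */
	struct task_struct *leader = cgroup_taskset_first(tset);

	example_account_move(cgrp, leader);
}

Because threadgroup_lock() is held across the migration, the set of threads seen by can_attach is the same one attach later operates on.
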
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e45..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
153}
154
155/* task is frozen or will freeze immediately when next it gets woken */
156static bool is_task_frozen_enough(struct task_struct *task)
157{
158 return frozen(task) ||
159 (task_is_stopped_or_traced(task) && freezing(task));
154} 160}
155 161
156/* 162/*
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 166 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
163 struct task_struct *task) 169 struct cgroup_taskset *tset)
164{ 170{
165 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
166 173
167 /* 174 /*
168 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
169 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
170 180
171 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
175 return 0; 185 return 0;
176} 186}
177 187
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
180 rcu_read_lock();
181 if (__cgroup_freezing_or_frozen(tsk)) {
182 rcu_read_unlock();
183 return -EBUSY;
184 }
185 rcu_read_unlock();
186 return 0;
187}
188
189static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
190{ 189{
191 struct freezer *freezer; 190 struct freezer *freezer;
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
213 212
214 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
215 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
216 freeze_task(task, true); 215 freeze_task(task);
217 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
218} 217}
219 218
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 232 ntotal++;
234 if (frozen(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
235 nfrozen++; 234 nfrozen++;
236 } 235 }
237 236
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
279 struct task_struct *task; 278 struct task_struct *task;
280 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
281 280
282 freezer->state = CGROUP_FREEZING;
283 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
284 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
286 continue; 284 continue;
287 if (frozen(task)) 285 if (is_task_frozen_enough(task))
288 continue; 286 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 287 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 288 num_cant_freeze_now++;
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 struct task_struct *task; 298 struct task_struct *task;
301 299
302 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
303 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
304 thaw_process(task); 302 __thaw_task(task);
305 }
306 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
307
308 freezer->state = CGROUP_THAWED;
309} 304}
310 305
311static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
319 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
320 315
321 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
322 if (goal_state == freezer->state)
323 goto out;
324 317
325 switch (goal_state) { 318 switch (goal_state) {
326 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
327 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
328 break; 324 break;
329 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
330 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
331 break; 330 break;
332 default: 331 default:
333 BUG(); 332 BUG();
334 } 333 }
335out: 334
336 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
337 336
338 return retval; 337 return retval;
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
381 .populate = freezer_populate, 380 .populate = freezer_populate,
382 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
383 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
387 .attach = NULL,
388 .fork = freezer_fork, 383 .fork = freezer_fork,
389 .exit = NULL,
390}; 384};
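
cgroup_freezing() replaces cgroup_freezing_or_frozen() and only needs rcu_read_lock(), which it takes itself, instead of task_lock(). A minimal sketch of a caller is below; example_should_freeze() is an invented, simplified decision helper, not the real freezer-core logic.

static bool example_should_freeze(struct task_struct *p)
{
	/* system_freezing_cnt is bumped above whenever a freezer cgroup
	 * leaves THAWED, so the common case stays a single atomic read.
	 */
	if (!atomic_read(&system_freezing_cnt))
		return false;

	/* slow path: is p's own freezer cgroup FREEZING or FROZEN? */
	return cgroup_freezing(p);
}
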
diff --git a/kernel/compat.c b/kernel/compat.c
index e2435ee9993a..f346cedfe24d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
24#include <linux/migrate.h> 25#include <linux/migrate.h>
25#include <linux/posix-timers.h> 26#include <linux/posix-timers.h>
26#include <linux/times.h> 27#include <linux/times.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b1..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,11 +10,12 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -177,8 +178,7 @@ static inline void check_for_tasks(int cpu)
177 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
178 for_each_process(p) { 179 for_each_process(p) {
179 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
180 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
181 !cputime_eq(p->stime, cputime_zero)))
182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
183 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
184 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -379,6 +379,7 @@ out:
379 cpu_maps_update_done(); 379 cpu_maps_update_done();
380 return err; 380 return err;
381} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
382 383
383#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
384static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
@@ -469,13 +470,86 @@ out:
469 cpu_maps_update_done(); 470 cpu_maps_update_done();
470} 471}
471 472
472static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
473{ 474{
474 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
475 return -ENOMEM; 476 return -ENOMEM;
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546static int __init cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
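
cpu_hotplug_pm_callback() above is an ordinary PM notifier: it sets cpu_hotplug_disabled before tasks are frozen and clears it after they are thawed, serializing against in-flight hotplug through cpu_add_remove_lock. The same hook points are available to any code that must stay quiescent across the freeze window; the sketch below assumes a made-up subsystem with example_quiesce()/example_resume() helpers.

#include <linux/suspend.h>
#include <linux/notifier.h>

static int example_pm_callback(struct notifier_block *nb,
			       unsigned long action, void *ptr)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		example_quiesce();	/* hypothetical helper */
		return NOTIFY_OK;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		example_resume();	/* hypothetical helper */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_pm_sync_init(void)
{
	pm_notifier(example_pm_callback, 0);
	return 0;
}
core_initcall(example_pm_sync_init);
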
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 000000000000..249152e15308
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
 84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
 92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
 107 * Inform the (nr_calls - 1) listeners that were already notified
 108 * that CPU PM entry has failed, so they can undo their preparation.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
 118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
 124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
 142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
 150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
 166 * Inform the (nr_calls - 1) listeners that were already notified
 167 * that CPU cluster PM entry has failed, so they can undo their preparation.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
 177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
 179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
 183 * Must be called after cpu_cluster_pm_enter has been called for the power
 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
 186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
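
From the driver side, the new kernel/cpu_pm.c API is consumed by registering a notifier with cpu_pm_register_notifier() and saving/restoring hardware state on the events; CPU_PM_ENTER_FAILED arrives when a later notifier vetoed the entry, so any save must be undone. The sketch below assumes invented save/restore helpers for a per-CPU and a per-cluster block.

#include <linux/cpu_pm.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		example_save_cpu_context();		/* hypothetical */
		break;
	case CPU_PM_ENTER_FAILED:			/* entry was aborted */
	case CPU_PM_EXIT:
		example_restore_cpu_context();		/* hypothetical */
		break;
	case CPU_CLUSTER_PM_ENTER:
		example_save_cluster_context();		/* hypothetical */
		break;
	case CPU_CLUSTER_PM_ENTER_FAILED:
	case CPU_CLUSTER_PM_EXIT:
		example_restore_cluster_context();	/* hypothetical */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_notify,
};

static int __init example_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
core_initcall(example_cpu_pm_init);
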
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff70..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/module.h> 40#include <linux/export.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 123 struct cpuset, css);
124} 124}
125 125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
126/* bits in struct cpuset flags field */ 139/* bits in struct cpuset flags field */
127typedef enum { 140typedef enum {
128 CS_CPU_EXCLUSIVE, 141 CS_CPU_EXCLUSIVE,
@@ -949,6 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949static void cpuset_change_task_nodemask(struct task_struct *tsk, 962static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems) 963 nodemask_t *newmems)
951{ 964{
965 bool need_loop;
966
952repeat: 967repeat:
953 /* 968 /*
954 * Allow tasks that have access to memory reserves because they have 969 * Allow tasks that have access to memory reserves because they have
@@ -960,10 +975,17 @@ repeat:
960 return; 975 return;
961 976
962 task_lock(tsk); 977 task_lock(tsk);
978 /*
979 * Determine if a loop is necessary if another thread is doing
980 * get_mems_allowed(). If at least one node remains unchanged and
981 * tsk does not have a mempolicy, then an empty nodemask will not be
982 * possible when mems_allowed is larger than a word.
983 */
984 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed);
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965 988
966
967 /* 989 /*
968 * ensure checking ->mems_allowed_change_disable after setting all new 990 * ensure checking ->mems_allowed_change_disable after setting all new
969 * allowed nodes. 991 * allowed nodes.
@@ -982,7 +1004,7 @@ repeat:
982 * Allocation of memory is very fast, we needn't sleep when waiting 1004 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side. 1005 * for the read-side.
984 */ 1006 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { 1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk); 1008 task_unlock(tsk);
987 if (!task_curr(tsk)) 1009 if (!task_curr(tsk))
988 yield(); 1010 yield();
@@ -1367,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1367 return val; 1389 return val;
1368} 1390}
1369 1391
1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1372 struct task_struct *tsk)
1373{
1374 struct cpuset *cs = cgroup_cs(cont);
1375
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1377 return -ENOSPC;
1378
1379 /*
1380 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1381 * cannot change their cpu affinity and isolating such threads by their
1382 * set of allowed nodes is unnecessary. Thus, cpusets are not
1383 * applicable for such threads. This prevents checking for success of
1384 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1385 * be changed.
1386 */
1387 if (tsk->flags & PF_THREAD_BOUND)
1388 return -EINVAL;
1389
1390 return 0;
1391}
1392
1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/* 1392/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1401 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1402 */ 1396 */
1403static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1406 1400
1407/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1408static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1409{ 1404{
1410 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1411 1427
1428 /* prepare for attach */
1412 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else 1431 else
1415 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1416 1433
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419 1435
1420/* Per-thread attachment work. */ 1436 return 0;
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1422{
1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1426 /*
1427 * can_attach beforehand should guarantee that this doesn't fail.
1428 * TODO: have a better way to handle failure here
1429 */
1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1431 WARN_ON_ONCE(err);
1432
1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, tsk);
1435} 1437}
1436 1438
1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1438 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1439{ 1441{
1440 struct mm_struct *mm; 1442 struct mm_struct *mm;
1441 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1442 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1443 1459
1444 /* 1460 /*
1445 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1447,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1447 */ 1463 */
1448 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1449 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1450 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1451 if (mm) { 1467 if (mm) {
1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1453 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1903,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1903 .create = cpuset_create, 1919 .create = cpuset_create,
1904 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1905 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1909 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1910 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1911 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
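
The need_loop test added to cpuset_change_task_nodemask() skips the yield loop when the old and new nodemasks share at least one node and the task has no mempolicy, since a concurrent reader then never sees an empty mask during the two-step update. Below is a user-space approximation of the predicate with invented mask values; nodemasks are reduced to plain bitmasks so the example stays runnable.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for
 *   need_loop = task_has_mempolicy(tsk) ||
 *               !nodes_intersects(*newmems, tsk->mems_allowed);
 */
static bool need_loop(unsigned long old_mems, unsigned long new_mems,
		      bool has_mempolicy)
{
	return has_mempolicy || !(old_mems & new_mems);
}

int main(void)
{
	/* {0,1} -> {1,2}: node 1 stays set through the nodes_or()/store
	 * sequence, so no retry loop is needed -- prints 0.
	 */
	printf("%d\n", need_loop(0x3, 0x6, false));

	/* {0} -> {3}: disjoint masks, a reader could observe an empty
	 * mask, so the loop is still required -- prints 1.
	 */
	printf("%d\n", need_loop(0x1, 0x8, false));
	return 0;
}
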
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d4..c766ee54c0b1 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,8 +20,15 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
 23 * stores the size of the ELF header of the crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed 28 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel. 29 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
25 */ 32 */
26static int __init setup_elfcorehdr(char *arg) 33static int __init setup_elfcorehdr(char *arg)
27{ 34{
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg)
29 if (!arg) 36 if (!arg)
30 return -EINVAL; 37 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end); 38 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
32 return end > arg ? 0 : -EINVAL; 43 return end > arg ? 0 : -EINVAL;
33} 44}
34early_param("elfcorehdr", setup_elfcorehdr); 45early_param("elfcorehdr", setup_elfcorehdr);
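
With the extended syntax, booting the capture kernel with e.g. elfcorehdr=0x20000@0x44000000 (values invented for illustration) sets elfcorehdr_size to 0x20000 and elfcorehdr_addr to 0x44000000, while the old single-value form still sets only the address. A user-space approximation of the two-step parse is below; the kernel uses memparse(), which additionally understands the K/M/G suffixes, and strtoull merely stands in for it here.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *arg = "0x20000@0x44000000";	/* made-up values */
	char *end;
	unsigned long long size = 0;
	unsigned long long addr = strtoull(arg, &end, 0);

	if (*end == '@') {	/* the first number was actually the size */
		size = addr;
		addr = strtoull(end + 1, &end, 0);
	}
	printf("elfcorehdr: size=%#llx addr=%#llx\n", size, addr);
	return 0;
}
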
diff --git a/kernel/cred.c b/kernel/cred.c
index 8ef31f53c44c..5791612a4045 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -644,6 +644,9 @@ void __init cred_init(void)
644 */ 644 */
645struct cred *prepare_kernel_cred(struct task_struct *daemon) 645struct cred *prepare_kernel_cred(struct task_struct *daemon)
646{ 646{
647#ifdef CONFIG_KEYS
648 struct thread_group_cred *tgcred;
649#endif
647 const struct cred *old; 650 const struct cred *old;
648 struct cred *new; 651 struct cred *new;
649 652
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
651 if (!new) 654 if (!new)
652 return NULL; 655 return NULL;
653 656
657#ifdef CONFIG_KEYS
658 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
659 if (!tgcred) {
660 kmem_cache_free(cred_jar, new);
661 return NULL;
662 }
663#endif
664
654 kdebug("prepare_kernel_cred() alloc %p", new); 665 kdebug("prepare_kernel_cred() alloc %p", new);
655 666
656 if (daemon) 667 if (daemon)
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
667 get_group_info(new->group_info); 678 get_group_info(new->group_info);
668 679
669#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
670 atomic_inc(&init_tgcred.usage); 681 atomic_set(&tgcred->usage, 1);
671 new->tgcred = &init_tgcred; 682 spin_lock_init(&tgcred->lock);
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
672 new->request_key_auth = NULL; 686 new->request_key_auth = NULL;
673 new->thread_keyring = NULL; 687 new->thread_keyring = NULL;
674 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 34872482315e..c22d8c28ad84 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = pack_hex_byte(bufptr, s[i]); 220 bufptr = hex_byte_pack(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = pack_hex_byte(buf, *tmp); 252 buf = hex_byte_pack(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = pack_hex_byte(pkt, *id); 414 pkt = hex_byte_pack(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = pack_hex_byte(pkt, 0); 421 pkt = hex_byte_pack(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 489 hex_byte_pack(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = pack_hex_byte(ptr, ks->signo); 957 ptr = hex_byte_pack(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
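
The gdbstub changes are a mechanical rename of pack_hex_byte() to hex_byte_pack(); the helper still just appends the two hex digits of a byte and returns the advanced buffer pointer. A stand-alone sketch of the equivalent behaviour follows (the demo function name is invented).

#include <stdio.h>

static char *hex_byte_pack_demo(char *buf, unsigned char byte)
{
	static const char hex_asc[] = "0123456789abcdef";

	*buf++ = hex_asc[byte >> 4];	/* high nibble first */
	*buf++ = hex_asc[byte & 0x0f];
	return buf;
}

int main(void)
{
	char out[8];
	char *p = hex_byte_pack_demo(out, 0x4a);

	*p = '\0';
	printf("%s\n", out);	/* prints "4a" */
	return 0;
}
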
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index d9ca9aa481ec..8b68ce78ff17 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,6 +11,7 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
14#include "kdb_private.h" 15#include "kdb_private.h"
15#include "../debug_core.h" 16#include "../debug_core.h"
16 17
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c5304..68a2306522c8 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118 if (err)
119 release_callchain_buffers();
120exit:
121 mutex_unlock(&callchain_mutex);
122
123 return err;
124}
125
126void put_callchain_buffers(void)
127{
128 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
129 release_callchain_buffers();
130 mutex_unlock(&callchain_mutex);
131 }
132}
133
134static struct perf_callchain_entry *get_callchain_entry(int *rctx)
135{
136 int cpu;
137 struct callchain_cpus_entries *entries;
138
139 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
140 if (*rctx == -1)
141 return NULL;
142
143 entries = rcu_dereference(callchain_cpus_entries);
144 if (!entries)
145 return NULL;
146
147 cpu = smp_processor_id();
148
149 return &entries->cpu_entries[cpu][*rctx];
150}
151
152static void
153put_callchain_entry(int rctx)
154{
155 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
156}
157
158struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
159{
160 int rctx;
161 struct perf_callchain_entry *entry;
162
163
164 entry = get_callchain_entry(&rctx);
165 if (rctx == -1)
166 return NULL;
167
168 if (!entry)
169 goto exit_put;
170
171 entry->nr = 0;
172
173 if (!user_mode(regs)) {
174 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
175 perf_callchain_kernel(entry, regs);
176 if (current->mm)
177 regs = task_pt_regs(current);
178 else
179 regs = NULL;
180 }
181
182 if (regs) {
183 perf_callchain_store(entry, PERF_CONTEXT_USER);
184 perf_callchain_user(entry, regs);
185 }
186
187exit_put:
188 put_callchain_entry(rctx);
189
190 return entry;
191}
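The new callchain.c keeps a simple ownership scheme for the per-CPU sample buffers: the first event that needs callchains allocates one entry set per possible CPU under callchain_mutex, later events only bump nr_callchain_events, and the last put_callchain_buffers() hands the whole set to an RCU callback. The userspace sketch below models just that first-get-allocates / last-put-frees pattern; a pthread mutex stands in for the kernel mutex, plain free() for call_rcu(), and every name is illustrative rather than kernel API.

/*
 * Userspace analogue of get/put_callchain_buffers(): the first user
 * allocates the shared area under a mutex, later users only take a
 * reference, the last put frees everything.  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_FAKE_CPUS 4

static pthread_mutex_t buf_mutex = PTHREAD_MUTEX_INITIALIZER;
static int nr_users;
static int *cpu_buffers[NR_FAKE_CPUS];

static int get_buffers(void)
{
	int cpu, err = 0;

	pthread_mutex_lock(&buf_mutex);
	if (nr_users++ > 0)		/* already allocated by a prior user */
		goto out;

	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
		cpu_buffers[cpu] = calloc(128, sizeof(int));
		if (!cpu_buffers[cpu]) {
			err = -1;
			break;
		}
	}
	if (err) {			/* roll back on partial failure */
		for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
			free(cpu_buffers[cpu]);
			cpu_buffers[cpu] = NULL;
		}
		nr_users--;
	}
out:
	pthread_mutex_unlock(&buf_mutex);
	return err;
}

static void put_buffers(void)
{
	int cpu;

	pthread_mutex_lock(&buf_mutex);
	if (--nr_users == 0) {		/* last user tears everything down */
		for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
			free(cpu_buffers[cpu]);
			cpu_buffers[cpu] = NULL;
		}
	}
	pthread_mutex_unlock(&buf_mutex);
}

int main(void)
{
	if (get_buffers() == 0) {
		cpu_buffers[0][0] = 42;
		printf("slot 0 on cpu 0 = %d\n", cpu_buffers[0][0]);
		put_buffers();
	}
	return 0;
}

(The kernel version additionally remembers a previously failed allocation via the NULL callchain_cpus_entries check; the sketch only shows the refcounted lifetime.)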
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f857782d06f..a8f4ac001a00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/hardirq.h> 30#include <linux/hardirq.h>
30#include <linux/rculist.h> 31#include <linux/rculist.h>
@@ -127,7 +128,7 @@ enum event_type_t {
127 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
129 */ 130 */
130struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
132 133
133static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -184,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
184static void update_context_time(struct perf_event_context *ctx); 185static void update_context_time(struct perf_event_context *ctx);
185static u64 perf_event_time(struct perf_event *event); 186static u64 perf_event_time(struct perf_event *event);
186 187
188static void ring_buffer_attach(struct perf_event *event,
189 struct ring_buffer *rb);
190
187void __weak perf_event_print_debug(void) { } 191void __weak perf_event_print_debug(void) { }
188 192
189extern __weak const char *perf_pmu_name(void) 193extern __weak const char *perf_pmu_name(void)
@@ -1126,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1126 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1127 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1128 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1129 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1130 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1131} 1137}
@@ -1321,6 +1327,7 @@ retry:
1321 } 1327 }
1322 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1323} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1324 1331
1325static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1326 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1402,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1402 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1403 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1404 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1405 1414
1406 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1407 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1658,8 +1667,7 @@ retry:
1658 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1659 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1660 */ 1669 */
1661static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1662 struct perf_event_context *ctx)
1663{ 1671{
1664 struct perf_event *sub; 1672 struct perf_event *sub;
1665 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1697,7 +1705,7 @@ static int __perf_event_enable(void *info)
1697 */ 1705 */
1698 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1699 1707
1700 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1701 1709
1702 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1703 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1778,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1778 1786
1779retry: 1787retry:
1780 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1781 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1782 goto out; 1790 goto out;
1783 } 1791 }
1784 1792
@@ -1805,6 +1813,7 @@ retry:
1805out: 1813out:
1806 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1807} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1808 1817
1809int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1810{ 1819{
@@ -2170,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2170 */ 2179 */
2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2180 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2172 2181
2173 perf_event_sched_in(cpuctx, ctx, task); 2182 if (ctx->nr_events)
2183 cpuctx->task_ctx = ctx;
2174 2184
2175 cpuctx->task_ctx = ctx; 2185 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2176 2186
2177 perf_pmu_enable(ctx->pmu); 2187 perf_pmu_enable(ctx->pmu);
2178 perf_ctx_unlock(cpuctx, ctx); 2188 perf_ctx_unlock(cpuctx, ctx);
@@ -2322,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2322 u64 interrupts, now; 2332 u64 interrupts, now;
2323 s64 delta; 2333 s64 delta;
2324 2334
2335 if (!ctx->nr_freq)
2336 return;
2337
2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2326 if (event->state != PERF_EVENT_STATE_ACTIVE) 2339 if (event->state != PERF_EVENT_STATE_ACTIVE)
2327 continue; 2340 continue;
@@ -2377,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2377{ 2390{
2378 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; 2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2379 struct perf_event_context *ctx = NULL; 2392 struct perf_event_context *ctx = NULL;
2380 int rotate = 0, remove = 1; 2393 int rotate = 0, remove = 1, freq = 0;
2381 2394
2382 if (cpuctx->ctx.nr_events) { 2395 if (cpuctx->ctx.nr_events) {
2383 remove = 0; 2396 remove = 0;
2384 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2385 rotate = 1; 2398 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2386 } 2401 }
2387 2402
2388 ctx = cpuctx->task_ctx; 2403 ctx = cpuctx->task_ctx;
@@ -2390,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2390 remove = 0; 2405 remove = 0;
2391 if (ctx->nr_events != ctx->nr_active) 2406 if (ctx->nr_events != ctx->nr_active)
2392 rotate = 1; 2407 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2393 } 2410 }
2394 2411
2412 if (!rotate && !freq)
2413 goto done;
2414
2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2396 perf_pmu_disable(cpuctx->ctx.pmu); 2416 perf_pmu_disable(cpuctx->ctx.pmu);
2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2398 if (ctx)
2399 perf_ctx_adjust_freq(ctx, interval);
2400 2417
2401 if (!rotate) 2418 if (freq) {
2402 goto done; 2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2420 if (ctx)
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2403 2423
2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2424 if (rotate) {
2405 if (ctx) 2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2406 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2407 2428
2408 rotate_ctx(&cpuctx->ctx); 2429 rotate_ctx(&cpuctx->ctx);
2409 if (ctx) 2430 if (ctx)
2410 rotate_ctx(ctx); 2431 rotate_ctx(ctx);
2432
2433 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2411 2435
2412 perf_event_sched_in(cpuctx, ctx, current); 2436 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2413 2438
2414done: 2439done:
2415 if (remove) 2440 if (remove)
2416 list_del_init(&cpuctx->rotation_list); 2441 list_del_init(&cpuctx->rotation_list);
2417
2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2420} 2442}
2421 2443
2422void perf_event_task_tick(void) 2444void perf_event_task_tick(void)
@@ -2443,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
2443 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2465 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2444 return 0; 2466 return 0;
2445 2467
2446 __perf_event_mark_enabled(event, ctx); 2468 __perf_event_mark_enabled(event);
2447 2469
2448 return 1; 2470 return 1;
2449} 2471}
@@ -2475,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2475 raw_spin_lock(&ctx->lock); 2497 raw_spin_lock(&ctx->lock);
2476 task_ctx_sched_out(ctx); 2498 task_ctx_sched_out(ctx);
2477 2499
2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2500 list_for_each_entry(event, &ctx->event_list, event_entry) {
2479 ret = event_enable_on_exec(event, ctx);
2480 if (ret)
2481 enabled = 1;
2482 }
2483
2484 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2485 ret = event_enable_on_exec(event, ctx); 2501 ret = event_enable_on_exec(event, ctx);
2486 if (ret) 2502 if (ret)
2487 enabled = 1; 2503 enabled = 1;
@@ -2569,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
2569} 2585}
2570 2586
2571/* 2587/*
2572 * Callchain support
2573 */
2574
2575struct callchain_cpus_entries {
2576 struct rcu_head rcu_head;
2577 struct perf_callchain_entry *cpu_entries[0];
2578};
2579
2580static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2581static atomic_t nr_callchain_events;
2582static DEFINE_MUTEX(callchain_mutex);
2583struct callchain_cpus_entries *callchain_cpus_entries;
2584
2585
2586__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2587 struct pt_regs *regs)
2588{
2589}
2590
2591__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596static void release_callchain_buffers_rcu(struct rcu_head *head)
2597{
2598 struct callchain_cpus_entries *entries;
2599 int cpu;
2600
2601 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2602
2603 for_each_possible_cpu(cpu)
2604 kfree(entries->cpu_entries[cpu]);
2605
2606 kfree(entries);
2607}
2608
2609static void release_callchain_buffers(void)
2610{
2611 struct callchain_cpus_entries *entries;
2612
2613 entries = callchain_cpus_entries;
2614 rcu_assign_pointer(callchain_cpus_entries, NULL);
2615 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2616}
2617
2618static int alloc_callchain_buffers(void)
2619{
2620 int cpu;
2621 int size;
2622 struct callchain_cpus_entries *entries;
2623
2624 /*
2625 * We can't use the percpu allocation API for data that can be
2626 * accessed from NMI. Use a temporary manual per cpu allocation
2627 * until that gets sorted out.
2628 */
2629 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2630
2631 entries = kzalloc(size, GFP_KERNEL);
2632 if (!entries)
2633 return -ENOMEM;
2634
2635 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2636
2637 for_each_possible_cpu(cpu) {
2638 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2639 cpu_to_node(cpu));
2640 if (!entries->cpu_entries[cpu])
2641 goto fail;
2642 }
2643
2644 rcu_assign_pointer(callchain_cpus_entries, entries);
2645
2646 return 0;
2647
2648fail:
2649 for_each_possible_cpu(cpu)
2650 kfree(entries->cpu_entries[cpu]);
2651 kfree(entries);
2652
2653 return -ENOMEM;
2654}
2655
2656static int get_callchain_buffers(void)
2657{
2658 int err = 0;
2659 int count;
2660
2661 mutex_lock(&callchain_mutex);
2662
2663 count = atomic_inc_return(&nr_callchain_events);
2664 if (WARN_ON_ONCE(count < 1)) {
2665 err = -EINVAL;
2666 goto exit;
2667 }
2668
2669 if (count > 1) {
2670 /* If the allocation failed, give up */
2671 if (!callchain_cpus_entries)
2672 err = -ENOMEM;
2673 goto exit;
2674 }
2675
2676 err = alloc_callchain_buffers();
2677 if (err)
2678 release_callchain_buffers();
2679exit:
2680 mutex_unlock(&callchain_mutex);
2681
2682 return err;
2683}
2684
2685static void put_callchain_buffers(void)
2686{
2687 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2688 release_callchain_buffers();
2689 mutex_unlock(&callchain_mutex);
2690 }
2691}
2692
2693static int get_recursion_context(int *recursion)
2694{
2695 int rctx;
2696
2697 if (in_nmi())
2698 rctx = 3;
2699 else if (in_irq())
2700 rctx = 2;
2701 else if (in_softirq())
2702 rctx = 1;
2703 else
2704 rctx = 0;
2705
2706 if (recursion[rctx])
2707 return -1;
2708
2709 recursion[rctx]++;
2710 barrier();
2711
2712 return rctx;
2713}
2714
2715static inline void put_recursion_context(int *recursion, int rctx)
2716{
2717 barrier();
2718 recursion[rctx]--;
2719}
2720
2721static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2722{
2723 int cpu;
2724 struct callchain_cpus_entries *entries;
2725
2726 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2727 if (*rctx == -1)
2728 return NULL;
2729
2730 entries = rcu_dereference(callchain_cpus_entries);
2731 if (!entries)
2732 return NULL;
2733
2734 cpu = smp_processor_id();
2735
2736 return &entries->cpu_entries[cpu][*rctx];
2737}
2738
2739static void
2740put_callchain_entry(int rctx)
2741{
2742 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2743}
2744
2745static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2746{
2747 int rctx;
2748 struct perf_callchain_entry *entry;
2749
2750
2751 entry = get_callchain_entry(&rctx);
2752 if (rctx == -1)
2753 return NULL;
2754
2755 if (!entry)
2756 goto exit_put;
2757
2758 entry->nr = 0;
2759
2760 if (!user_mode(regs)) {
2761 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2762 perf_callchain_kernel(entry, regs);
2763 if (current->mm)
2764 regs = task_pt_regs(current);
2765 else
2766 regs = NULL;
2767 }
2768
2769 if (regs) {
2770 perf_callchain_store(entry, PERF_CONTEXT_USER);
2771 perf_callchain_user(entry, regs);
2772 }
2773
2774exit_put:
2775 put_callchain_entry(rctx);
2776
2777 return entry;
2778}
2779
2780/*
2781 * Initialize the perf_event context in a task_struct: 2588 * Initialize the perf_event context in a task_struct:
2782 */ 2589 */
2783static void __perf_event_init_context(struct perf_event_context *ctx) 2590static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2941,7 +2748,7 @@ static void free_event(struct perf_event *event)
2941 2748
2942 if (!event->parent) { 2749 if (!event->parent) {
2943 if (event->attach_state & PERF_ATTACH_TASK) 2750 if (event->attach_state & PERF_ATTACH_TASK)
2944 jump_label_dec(&perf_sched_events); 2751 jump_label_dec_deferred(&perf_sched_events);
2945 if (event->attr.mmap || event->attr.mmap_data) 2752 if (event->attr.mmap || event->attr.mmap_data)
2946 atomic_dec(&nr_mmap_events); 2753 atomic_dec(&nr_mmap_events);
2947 if (event->attr.comm) 2754 if (event->attr.comm)
@@ -2952,7 +2759,7 @@ static void free_event(struct perf_event *event)
2952 put_callchain_buffers(); 2759 put_callchain_buffers();
2953 if (is_cgroup_event(event)) { 2760 if (is_cgroup_event(event)) {
2954 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2955 jump_label_dec(&perf_sched_events); 2762 jump_label_dec_deferred(&perf_sched_events);
2956 } 2763 }
2957 } 2764 }
2958 2765
@@ -3189,12 +2996,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3189 struct ring_buffer *rb; 2996 struct ring_buffer *rb;
3190 unsigned int events = POLL_HUP; 2997 unsigned int events = POLL_HUP;
3191 2998
2999 /*
3000 * Race between perf_event_set_output() and perf_poll(): perf_poll()
3001 * grabs the rb reference but perf_event_set_output() overrides it.
3002 * Here is the timeline for two threads T1, T2:
3003 * t0: T1, rb = rcu_dereference(event->rb)
3004 * t1: T2, old_rb = event->rb
3005 * t2: T2, event->rb = new rb
3006 * t3: T2, ring_buffer_detach(old_rb)
3007 * t4: T1, ring_buffer_attach(rb1)
3008 * t5: T1, poll_wait(event->waitq)
3009 *
3010 * To avoid this problem, we grab mmap_mutex in perf_poll()
3011 * thereby ensuring that the assignment of the new ring buffer
3012 * and the detachment of the old buffer appear atomic to perf_poll()
3013 */
3014 mutex_lock(&event->mmap_mutex);
3015
3192 rcu_read_lock(); 3016 rcu_read_lock();
3193 rb = rcu_dereference(event->rb); 3017 rb = rcu_dereference(event->rb);
3194 if (rb) 3018 if (rb) {
3019 ring_buffer_attach(event, rb);
3195 events = atomic_xchg(&rb->poll, 0); 3020 events = atomic_xchg(&rb->poll, 0);
3021 }
3196 rcu_read_unlock(); 3022 rcu_read_unlock();
3197 3023
3024 mutex_unlock(&event->mmap_mutex);
3025
3198 poll_wait(file, &event->waitq, wait); 3026 poll_wait(file, &event->waitq, wait);
3199 3027
3200 return events; 3028 return events;
@@ -3495,6 +3323,53 @@ unlock:
3495 return ret; 3323 return ret;
3496} 3324}
3497 3325
3326static void ring_buffer_attach(struct perf_event *event,
3327 struct ring_buffer *rb)
3328{
3329 unsigned long flags;
3330
3331 if (!list_empty(&event->rb_entry))
3332 return;
3333
3334 spin_lock_irqsave(&rb->event_lock, flags);
3335 if (!list_empty(&event->rb_entry))
3336 goto unlock;
3337
3338 list_add(&event->rb_entry, &rb->event_list);
3339unlock:
3340 spin_unlock_irqrestore(&rb->event_lock, flags);
3341}
3342
3343static void ring_buffer_detach(struct perf_event *event,
3344 struct ring_buffer *rb)
3345{
3346 unsigned long flags;
3347
3348 if (list_empty(&event->rb_entry))
3349 return;
3350
3351 spin_lock_irqsave(&rb->event_lock, flags);
3352 list_del_init(&event->rb_entry);
3353 wake_up_all(&event->waitq);
3354 spin_unlock_irqrestore(&rb->event_lock, flags);
3355}
3356
3357static void ring_buffer_wakeup(struct perf_event *event)
3358{
3359 struct ring_buffer *rb;
3360
3361 rcu_read_lock();
3362 rb = rcu_dereference(event->rb);
3363 if (!rb)
3364 goto unlock;
3365
3366 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3367 wake_up_all(&event->waitq);
3368
3369unlock:
3370 rcu_read_unlock();
3371}
3372
3498static void rb_free_rcu(struct rcu_head *rcu_head) 3373static void rb_free_rcu(struct rcu_head *rcu_head)
3499{ 3374{
3500 struct ring_buffer *rb; 3375 struct ring_buffer *rb;
@@ -3520,9 +3395,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3520 3395
3521static void ring_buffer_put(struct ring_buffer *rb) 3396static void ring_buffer_put(struct ring_buffer *rb)
3522{ 3397{
3398 struct perf_event *event, *n;
3399 unsigned long flags;
3400
3523 if (!atomic_dec_and_test(&rb->refcount)) 3401 if (!atomic_dec_and_test(&rb->refcount))
3524 return; 3402 return;
3525 3403
3404 spin_lock_irqsave(&rb->event_lock, flags);
3405 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3406 list_del_init(&event->rb_entry);
3407 wake_up_all(&event->waitq);
3408 }
3409 spin_unlock_irqrestore(&rb->event_lock, flags);
3410
3526 call_rcu(&rb->rcu_head, rb_free_rcu); 3411 call_rcu(&rb->rcu_head, rb_free_rcu);
3527} 3412}
3528 3413
@@ -3543,8 +3428,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3543 struct ring_buffer *rb = event->rb; 3428 struct ring_buffer *rb = event->rb;
3544 3429
3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3430 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3546 vma->vm_mm->locked_vm -= event->mmap_locked; 3431 vma->vm_mm->pinned_vm -= event->mmap_locked;
3547 rcu_assign_pointer(event->rb, NULL); 3432 rcu_assign_pointer(event->rb, NULL);
3433 ring_buffer_detach(event, rb);
3548 mutex_unlock(&event->mmap_mutex); 3434 mutex_unlock(&event->mmap_mutex);
3549 3435
3550 ring_buffer_put(rb); 3436 ring_buffer_put(rb);
@@ -3624,7 +3510,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3624 3510
3625 lock_limit = rlimit(RLIMIT_MEMLOCK); 3511 lock_limit = rlimit(RLIMIT_MEMLOCK);
3626 lock_limit >>= PAGE_SHIFT; 3512 lock_limit >>= PAGE_SHIFT;
3627 locked = vma->vm_mm->locked_vm + extra; 3513 locked = vma->vm_mm->pinned_vm + extra;
3628 3514
3629 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3515 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3630 !capable(CAP_IPC_LOCK)) { 3516 !capable(CAP_IPC_LOCK)) {
@@ -3650,7 +3536,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3650 atomic_long_add(user_extra, &user->locked_vm); 3536 atomic_long_add(user_extra, &user->locked_vm);
3651 event->mmap_locked = extra; 3537 event->mmap_locked = extra;
3652 event->mmap_user = get_current_user(); 3538 event->mmap_user = get_current_user();
3653 vma->vm_mm->locked_vm += event->mmap_locked; 3539 vma->vm_mm->pinned_vm += event->mmap_locked;
3654 3540
3655unlock: 3541unlock:
3656 if (!ret) 3542 if (!ret)
@@ -3699,7 +3585,7 @@ static const struct file_operations perf_fops = {
3699 3585
3700void perf_event_wakeup(struct perf_event *event) 3586void perf_event_wakeup(struct perf_event *event)
3701{ 3587{
3702 wake_up_all(&event->waitq); 3588 ring_buffer_wakeup(event);
3703 3589
3704 if (event->pending_kill) { 3590 if (event->pending_kill) {
3705 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 3591 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -4736,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4736 struct hw_perf_event *hwc = &event->hw; 4622 struct hw_perf_event *hwc = &event->hw;
4737 int throttle = 0; 4623 int throttle = 0;
4738 4624
4739 data->period = event->hw.last_period;
4740 if (!overflow) 4625 if (!overflow)
4741 overflow = perf_swevent_set_period(event); 4626 overflow = perf_swevent_set_period(event);
4742 4627
@@ -4770,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4770 if (!is_sampling_event(event)) 4655 if (!is_sampling_event(event))
4771 return; 4656 return;
4772 4657
4658 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4659 data->period = nr;
4660 return perf_swevent_overflow(event, 1, data, regs);
4661 } else
4662 data->period = event->hw.last_period;
4663
4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4664 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4774 return perf_swevent_overflow(event, 1, data, regs); 4665 return perf_swevent_overflow(event, 1, data, regs);
4775 4666
@@ -5282,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5282 regs = get_irq_regs(); 5173 regs = get_irq_regs();
5283 5174
5284 if (regs && !perf_exclude_event(event, regs)) { 5175 if (regs && !perf_exclude_event(event, regs)) {
5285 if (!(event->attr.exclude_idle && current->pid == 0)) 5176 if (!(event->attr.exclude_idle && is_idle_task(current)))
5286 if (perf_event_overflow(event, &data, regs)) 5177 if (perf_event_overflow(event, &data, regs))
5287 ret = HRTIMER_NORESTART; 5178 ret = HRTIMER_NORESTART;
5288 } 5179 }
@@ -5758,6 +5649,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5758 pmu = idr_find(&pmu_idr, event->attr.type); 5649 pmu = idr_find(&pmu_idr, event->attr.type);
5759 rcu_read_unlock(); 5650 rcu_read_unlock();
5760 if (pmu) { 5651 if (pmu) {
5652 event->pmu = pmu;
5761 ret = pmu->event_init(event); 5653 ret = pmu->event_init(event);
5762 if (ret) 5654 if (ret)
5763 pmu = ERR_PTR(ret); 5655 pmu = ERR_PTR(ret);
@@ -5765,6 +5657,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5765 } 5657 }
5766 5658
5767 list_for_each_entry_rcu(pmu, &pmus, entry) { 5659 list_for_each_entry_rcu(pmu, &pmus, entry) {
5660 event->pmu = pmu;
5768 ret = pmu->event_init(event); 5661 ret = pmu->event_init(event);
5769 if (!ret) 5662 if (!ret)
5770 goto unlock; 5663 goto unlock;
@@ -5819,6 +5712,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5819 INIT_LIST_HEAD(&event->group_entry); 5712 INIT_LIST_HEAD(&event->group_entry);
5820 INIT_LIST_HEAD(&event->event_entry); 5713 INIT_LIST_HEAD(&event->event_entry);
5821 INIT_LIST_HEAD(&event->sibling_list); 5714 INIT_LIST_HEAD(&event->sibling_list);
5715 INIT_LIST_HEAD(&event->rb_entry);
5716
5822 init_waitqueue_head(&event->waitq); 5717 init_waitqueue_head(&event->waitq);
5823 init_irq_work(&event->pending, perf_pending_event); 5718 init_irq_work(&event->pending, perf_pending_event);
5824 5719
@@ -5891,11 +5786,9 @@ done:
5891 return ERR_PTR(err); 5786 return ERR_PTR(err);
5892 } 5787 }
5893 5788
5894 event->pmu = pmu;
5895
5896 if (!event->parent) { 5789 if (!event->parent) {
5897 if (event->attach_state & PERF_ATTACH_TASK) 5790 if (event->attach_state & PERF_ATTACH_TASK)
5898 jump_label_inc(&perf_sched_events); 5791 jump_label_inc(&perf_sched_events.key);
5899 if (event->attr.mmap || event->attr.mmap_data) 5792 if (event->attr.mmap || event->attr.mmap_data)
5900 atomic_inc(&nr_mmap_events); 5793 atomic_inc(&nr_mmap_events);
5901 if (event->attr.comm) 5794 if (event->attr.comm)
@@ -6027,6 +5920,8 @@ set:
6027 5920
6028 old_rb = event->rb; 5921 old_rb = event->rb;
6029 rcu_assign_pointer(event->rb, rb); 5922 rcu_assign_pointer(event->rb, rb);
5923 if (old_rb)
5924 ring_buffer_detach(event, old_rb);
6030 ret = 0; 5925 ret = 0;
6031unlock: 5926unlock:
6032 mutex_unlock(&event->mmap_mutex); 5927 mutex_unlock(&event->mmap_mutex);
@@ -6131,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
6131 * - that may need work on context switch 6026 * - that may need work on context switch
6132 */ 6027 */
6133 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6134 jump_label_inc(&perf_sched_events); 6029 jump_label_inc(&perf_sched_events.key);
6135 } 6030 }
6136 6031
6137 /* 6032 /*
@@ -6977,6 +6872,9 @@ void __init perf_event_init(void)
6977 6872
6978 ret = init_hw_breakpoint(); 6873 ret = init_hw_breakpoint();
6979 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6874 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6875
6876 /* do not patch jump label more than once per second */
6877 jump_label_rate_limit(&perf_sched_events, HZ);
6980} 6878}
6981 6879
6982static int __init perf_event_sysfs_init(void) 6880static int __init perf_event_sysfs_init(void)
@@ -7043,10 +6941,13 @@ static int __perf_cgroup_move(void *info)
7043 return 0; 6941 return 0;
7044} 6942}
7045 6943
7046static void 6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7047perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6945 struct cgroup_taskset *tset)
7048{ 6946{
7049 task_function_call(task, __perf_cgroup_move, task); 6947 struct task_struct *task;
6948
6949 cgroup_taskset_for_each(task, cgrp, tset)
6950 task_function_call(task, __perf_cgroup_move, task);
7050} 6951}
7051 6952
7052static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7060,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7060 if (!(task->flags & PF_EXITING)) 6961 if (!(task->flags & PF_EXITING))
7061 return; 6962 return;
7062 6963
7063 perf_cgroup_attach_task(cgrp, task); 6964 task_function_call(task, __perf_cgroup_move, task);
7064} 6965}
7065 6966
7066struct cgroup_subsys perf_subsys = { 6967struct cgroup_subsys perf_subsys = {
@@ -7069,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {
7069 .create = perf_cgroup_create, 6970 .create = perf_cgroup_create,
7070 .destroy = perf_cgroup_destroy, 6971 .destroy = perf_cgroup_destroy,
7071 .exit = perf_cgroup_exit, 6972 .exit = perf_cgroup_exit,
7072 .attach_task = perf_cgroup_attach_task, 6973 .attach = perf_cgroup_attach,
7073}; 6974};
7074#endif /* CONFIG_CGROUP_PERF */ 6975#endif /* CONFIG_CGROUP_PERF */
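Several of the core.c hunks revolve around the new rb->event_list: perf_poll() now attaches the event to the ring buffer it dereferenced (under mmap_mutex, per the race timeline spelled out in the comment), and perf_event_wakeup() walks that list so every event redirected to the same buffer via set_output gets woken. The sketch below is only a userspace model of that attach-then-broadcast idea, using a pthread condition variable in place of the waitqueue; the fake_* types are made up for illustration.

/*
 * Userspace model of ring_buffer_attach()/ring_buffer_wakeup(): every
 * event currently pointing at a buffer sits on that buffer's waiter
 * list, so waking "the buffer" wakes all events sharing it.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_event {
	struct fake_event *next;	/* link on the buffer's waiter list */
	bool woken;
};

struct fake_buffer {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	struct fake_event *waiters;
};

static struct fake_buffer rb = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static void buffer_attach(struct fake_buffer *b, struct fake_event *e)
{
	pthread_mutex_lock(&b->lock);
	e->next = b->waiters;		/* push onto the waiter list */
	b->waiters = e;
	pthread_mutex_unlock(&b->lock);
}

static void buffer_wakeup(struct fake_buffer *b)
{
	struct fake_event *e;

	pthread_mutex_lock(&b->lock);
	for (e = b->waiters; e; e = e->next)
		e->woken = true;	/* mark every attached event */
	pthread_cond_broadcast(&b->wait);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct fake_event a = { 0 }, b = { 0 };

	buffer_attach(&rb, &a);
	buffer_attach(&rb, &b);		/* b redirected to the same buffer */
	buffer_wakeup(&rb);
	printf("a woken: %d, b woken: %d\n", a.woken, b.woken);
	return 0;
}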
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09097dd8116c..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -22,6 +26,9 @@ struct ring_buffer {
22 local_t lost; /* nr records lost */ 26 local_t lost; /* nr records lost */
23 27
24 long watermark; /* wakeup watermark */ 28 long watermark; /* wakeup watermark */
29 /* poll crap */
30 spinlock_t event_lock;
31 struct list_head event_list;
25 32
26 struct perf_event_mmap_page *user_page; 33 struct perf_event_mmap_page *user_page;
27 void *data_pages[0]; 34 void *data_pages[0];
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
64} 71}
65#endif 72#endif
66 73
67static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
68{ 75{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70} 77}
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
93 } while (len); 100 } while (len);
94} 101}
95 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
96#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
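get_recursion_context()/put_recursion_context() now live in internal.h so both core.c and the new callchain.c can use them: one flag per context level (task, softirq, hardirq, NMI), and an attempt to re-enter a level that is already busy is refused rather than allowed to recurse. The toy program below shows the same bail-out behaviour with plain ints in place of the per-CPU array; the level numbering is kept only by analogy.

/*
 * Standalone model of the recursion guard: re-entry at a busy context
 * level returns -1 instead of nesting.  Plain ints, no per-CPU data.
 */
#include <stdio.h>

#define NR_CTX 4	/* 3: "nmi", 2: "irq", 1: "softirq", 0: "task" */

static int recursion[NR_CTX];

static int get_recursion_context(int level)
{
	if (recursion[level])
		return -1;	/* already inside this context: bail out */
	recursion[level]++;
	return level;
}

static void put_recursion_context(int level)
{
	recursion[level]--;
}

int main(void)
{
	int rctx = get_recursion_context(0);

	printf("first enter : %d\n", rctx);			/* 0  */
	printf("nested enter: %d\n", get_recursion_context(0));	/* -1 */
	put_recursion_context(rctx);
	printf("re-enter    : %d\n", get_recursion_context(0));	/* 0  */
	return 0;
}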
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
209 rb->writable = 1; 209 rb->writable = 1;
210 210
211 atomic_set(&rb->refcount, 1); 211 atomic_set(&rb->refcount, 1);
212
213 INIT_LIST_HEAD(&rb->event_list);
214 spin_lock_init(&rb->event_lock);
212} 215}
213 216
214#ifndef CONFIG_PERF_USE_VMALLOC 217#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index 2913b3509d42..c44738267be7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 122 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 123 * will have been the last reference on the signal_struct.
123 */ 124 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 125 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 126 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 127 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 128 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 129 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 130 sig->nvcsw += tsk->nvcsw;
@@ -679,10 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
686 task_unlock(tsk); 683 task_unlock(tsk);
687 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
688 mmput(mm); 685 mmput(mm);
@@ -890,7 +887,7 @@ static void check_stack_usage(void)
890static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
891#endif 888#endif
892 889
893NORET_TYPE void do_exit(long code) 890void do_exit(long code)
894{ 891{
895 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
896 int group_dead; 893 int group_dead;
@@ -1039,9 +1036,12 @@ NORET_TYPE void do_exit(long code)
1039 validate_creds_for_do_exit(tsk); 1036 validate_creds_for_do_exit(tsk);
1040 1037
1041 preempt_disable(); 1038 preempt_disable();
1039 if (tsk->nr_dirtied)
1040 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1042 exit_rcu(); 1041 exit_rcu();
1043 /* causes final put_task_struct in finish_task_switch(). */ 1042 /* causes final put_task_struct in finish_task_switch(). */
1044 tsk->state = TASK_DEAD; 1043 tsk->state = TASK_DEAD;
1044 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1045 schedule(); 1045 schedule();
1046 BUG(); 1046 BUG();
1047 /* Avoid "noreturn function does return". */ 1047 /* Avoid "noreturn function does return". */
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code)
1051 1051
1052EXPORT_SYMBOL_GPL(do_exit); 1052EXPORT_SYMBOL_GPL(do_exit);
1053 1053
1054NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1054void complete_and_exit(struct completion *comp, long code)
1055{ 1055{
1056 if (comp) 1056 if (comp)
1057 complete(comp); 1057 complete(comp);
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1070 * Take down every thread in the group. This is called by fatal signals 1070 * Take down every thread in the group. This is called by fatal signals
1071 * as well as by sys_exit_group (below). 1071 * as well as by sys_exit_group (below).
1072 */ 1072 */
1073NORET_TYPE void 1073void
1074do_group_exit(int exit_code) 1074do_group_exit(int exit_code)
1075{ 1075{
1076 struct signal_struct *sig = current->signal; 1076 struct signal_struct *sig = current->signal;
@@ -1257,19 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1257 spin_lock_irq(&p->real_parent->sighand->siglock); 1257 spin_lock_irq(&p->real_parent->sighand->siglock);
1258 psig = p->real_parent->signal; 1258 psig = p->real_parent->signal;
1259 sig = p->signal; 1259 sig = p->signal;
1260 psig->cutime = 1260 psig->cutime += tgutime + sig->cutime;
1261 cputime_add(psig->cutime, 1261 psig->cstime += tgstime + sig->cstime;
1262 cputime_add(tgutime, 1262 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1263 sig->cutime));
1264 psig->cstime =
1265 cputime_add(psig->cstime,
1266 cputime_add(tgstime,
1267 sig->cstime));
1268 psig->cgtime =
1269 cputime_add(psig->cgtime,
1270 cputime_add(p->gtime,
1271 cputime_add(sig->gtime,
1272 sig->cgtime)));
1273 psig->cmin_flt += 1263 psig->cmin_flt +=
1274 p->min_flt + sig->min_flt + sig->cmin_flt; 1264 p->min_flt + sig->min_flt + sig->cmin_flt;
1275 psig->cmaj_flt += 1265 psig->cmaj_flt +=
@@ -1542,8 +1532,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1542 } 1532 }
1543 1533
1544 /* dead body doesn't have much to contribute */ 1534 /* dead body doesn't have much to contribute */
1545 if (p->exit_state == EXIT_DEAD) 1535 if (unlikely(p->exit_state == EXIT_DEAD)) {
1536 /*
1537 * But do not ignore this task until the tracer does
1538 * wait_task_zombie()->do_notify_parent().
1539 */
1540 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1541 wo->notask_error = 0;
1546 return 0; 1542 return 0;
1543 }
1547 1544
1548 /* slay zombie? */ 1545 /* slay zombie? */
1549 if (p->exit_state == EXIT_ZOMBIE) { 1546 if (p->exit_state == EXIT_ZOMBIE) {
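The wait_task_zombie() hunk drops the nested cputime_add() chains in favour of plain '+=', which is sound once cputime_t behaves as an ordinary scalar. A quick stand-alone check of the equivalence, with unsigned long long standing in for cputime_t (an assumption of this sketch, not the kernel definition):

/*
 * Old style (nested helper calls) and new style (straight accumulation)
 * produce the same totals when cputime_t is a plain integer type.
 */
#include <stdio.h>

typedef unsigned long long cputime_t;	/* stand-in, not the kernel type */

static cputime_t cputime_add(cputime_t a, cputime_t b)
{
	return a + b;
}

int main(void)
{
	cputime_t cutime_old = 100, cutime_new = 100;
	cputime_t tgutime = 40, sig_cutime = 2;

	/* old style: psig->cutime = cputime_add(psig->cutime, cputime_add(...)) */
	cutime_old = cputime_add(cutime_old, cputime_add(tgutime, sig_cutime));

	/* new style: psig->cutime += tgutime + sig->cutime */
	cutime_new += tgutime + sig_cutime;

	printf("old=%llu new=%llu\n", cutime_old, cutime_new);	/* both 142 */
	return 0;
}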
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e6b6f4fb272..443f5125f11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
76 76
77#include <trace/events/sched.h> 77#include <trace/events/sched.h>
78 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/task.h>
81
79/* 82/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 84 */
@@ -162,7 +165,6 @@ static void account_kernel_stack(struct thread_info *ti, int account)
162 165
163void free_task(struct task_struct *tsk) 166void free_task(struct task_struct *tsk)
164{ 167{
165 prop_local_destroy_single(&tsk->dirties);
166 account_kernel_stack(tsk->stack, -1); 168 account_kernel_stack(tsk->stack, -1);
167 free_thread_info(tsk->stack); 169 free_thread_info(tsk->stack);
168 rt_mutex_debug_task_free(tsk); 170 rt_mutex_debug_task_free(tsk);
@@ -274,10 +276,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
274 276
275 tsk->stack = ti; 277 tsk->stack = ti;
276 278
277 err = prop_local_init_single(&tsk->dirties);
278 if (err)
279 goto out;
280
281 setup_thread_stack(tsk, orig); 279 setup_thread_stack(tsk, orig);
282 clear_user_return_notifier(tsk); 280 clear_user_return_notifier(tsk);
283 clear_tsk_need_resched(tsk); 281 clear_tsk_need_resched(tsk);
@@ -501,7 +499,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
501 mm->cached_hole_size = ~0UL; 499 mm->cached_hole_size = ~0UL;
502 mm_init_aio(mm); 500 mm_init_aio(mm);
503 mm_init_owner(mm, p); 501 mm_init_owner(mm, p);
504 atomic_set(&mm->oom_disable_count, 0);
505 502
506 if (likely(!mm_alloc_pgd(mm))) { 503 if (likely(!mm_alloc_pgd(mm))) {
507 mm->def_flags = 0; 504 mm->def_flags = 0;
@@ -816,8 +813,6 @@ good_mm:
816 /* Initializing for Swap token stuff */ 813 /* Initializing for Swap token stuff */
817 mm->token_priority = 0; 814 mm->token_priority = 0;
818 mm->last_interval = 0; 815 mm->last_interval = 0;
819 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
820 atomic_inc(&mm->oom_disable_count);
821 816
822 tsk->mm = mm; 817 tsk->mm = mm;
823 tsk->active_mm = mm; 818 tsk->active_mm = mm;
@@ -980,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
980 sched_autogroup_fork(sig); 975 sched_autogroup_fork(sig);
981 976
982#ifdef CONFIG_CGROUPS 977#ifdef CONFIG_CGROUPS
983 init_rwsem(&sig->threadgroup_fork_lock); 978 init_rwsem(&sig->group_rwsem);
984#endif 979#endif
985 980
986 sig->oom_adj = current->signal->oom_adj; 981 sig->oom_adj = current->signal->oom_adj;
@@ -1000,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1000 new_flags |= PF_FORKNOEXEC; 995 new_flags |= PF_FORKNOEXEC;
1001 new_flags |= PF_STARTING; 996 new_flags |= PF_STARTING;
1002 p->flags = new_flags; 997 p->flags = new_flags;
1003 clear_freeze_flag(p);
1004} 998}
1005 999
1006SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1000SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1031,8 +1025,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1031 */ 1025 */
1032static void posix_cpu_timers_init(struct task_struct *tsk) 1026static void posix_cpu_timers_init(struct task_struct *tsk)
1033{ 1027{
1034 tsk->cputime_expires.prof_exp = cputime_zero; 1028 tsk->cputime_expires.prof_exp = 0;
1035 tsk->cputime_expires.virt_exp = cputime_zero; 1029 tsk->cputime_expires.virt_exp = 0;
1036 tsk->cputime_expires.sched_exp = 0; 1030 tsk->cputime_expires.sched_exp = 0;
1037 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1031 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1038 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1032 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1140,14 +1134,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1140 1134
1141 init_sigpending(&p->pending); 1135 init_sigpending(&p->pending);
1142 1136
1143 p->utime = cputime_zero; 1137 p->utime = p->stime = p->gtime = 0;
1144 p->stime = cputime_zero; 1138 p->utimescaled = p->stimescaled = 0;
1145 p->gtime = cputime_zero;
1146 p->utimescaled = cputime_zero;
1147 p->stimescaled = cputime_zero;
1148#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1139#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1149 p->prev_utime = cputime_zero; 1140 p->prev_utime = p->prev_stime = 0;
1150 p->prev_stime = cputime_zero;
1151#endif 1141#endif
1152#if defined(SPLIT_RSS_COUNTING) 1142#if defined(SPLIT_RSS_COUNTING)
1153 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1143 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1166,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1166 p->io_context = NULL; 1156 p->io_context = NULL;
1167 p->audit_context = NULL; 1157 p->audit_context = NULL;
1168 if (clone_flags & CLONE_THREAD) 1158 if (clone_flags & CLONE_THREAD)
1169 threadgroup_fork_read_lock(current); 1159 threadgroup_change_begin(current);
1170 cgroup_fork(p); 1160 cgroup_fork(p);
1171#ifdef CONFIG_NUMA 1161#ifdef CONFIG_NUMA
1172 p->mempolicy = mpol_dup(p->mempolicy); 1162 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1302,6 +1292,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1302 p->pdeath_signal = 0; 1292 p->pdeath_signal = 0;
1303 p->exit_state = 0; 1293 p->exit_state = 0;
1304 1294
1295 p->nr_dirtied = 0;
1296 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1297 p->dirty_paused_when = 0;
1298
1305 /* 1299 /*
1306 * Ok, make it visible to the rest of the system. 1300 * Ok, make it visible to the rest of the system.
1307 * We dont wake it up yet. 1301 * We dont wake it up yet.
@@ -1378,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1378 proc_fork_connector(p); 1372 proc_fork_connector(p);
1379 cgroup_post_fork(p); 1373 cgroup_post_fork(p);
1380 if (clone_flags & CLONE_THREAD) 1374 if (clone_flags & CLONE_THREAD)
1381 threadgroup_fork_read_unlock(current); 1375 threadgroup_change_end(current);
1382 perf_event_fork(p); 1376 perf_event_fork(p);
1377
1378 trace_task_newtask(p, clone_flags);
1379
1383 return p; 1380 return p;
1384 1381
1385bad_fork_free_pid: 1382bad_fork_free_pid:
@@ -1391,13 +1388,8 @@ bad_fork_cleanup_io:
1391bad_fork_cleanup_namespaces: 1388bad_fork_cleanup_namespaces:
1392 exit_task_namespaces(p); 1389 exit_task_namespaces(p);
1393bad_fork_cleanup_mm: 1390bad_fork_cleanup_mm:
1394 if (p->mm) { 1391 if (p->mm)
1395 task_lock(p);
1396 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1397 atomic_dec(&p->mm->oom_disable_count);
1398 task_unlock(p);
1399 mmput(p->mm); 1392 mmput(p->mm);
1400 }
1401bad_fork_cleanup_signal: 1393bad_fork_cleanup_signal:
1402 if (!(clone_flags & CLONE_THREAD)) 1394 if (!(clone_flags & CLONE_THREAD))
1403 free_signal_struct(p->signal); 1395 free_signal_struct(p->signal);
@@ -1418,7 +1410,7 @@ bad_fork_cleanup_policy:
1418bad_fork_cleanup_cgroup: 1410bad_fork_cleanup_cgroup:
1419#endif 1411#endif
1420 if (clone_flags & CLONE_THREAD) 1412 if (clone_flags & CLONE_THREAD)
1421 threadgroup_fork_read_unlock(current); 1413 threadgroup_change_end(current);
1422 cgroup_exit(p, cgroup_callbacks_done); 1414 cgroup_exit(p, cgroup_callbacks_done);
1423 delayacct_tsk_free(p); 1415 delayacct_tsk_free(p);
1424 module_put(task_thread_info(p)->exec_domain->module); 1416 module_put(task_thread_info(p)->exec_domain->module);
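copy_process() now seeds the dirty-throttling fields, and the initial nr_dirtied_pause of 128 >> (PAGE_SHIFT - 10) is simply "about 128KB worth of pages" independent of page size. A small arithmetic check, assuming 4KB pages (PAGE_SHIFT == 12):

/* Worked example of the nr_dirtied_pause seed; PAGE_SHIFT is assumed. */
#include <stdio.h>

int main(void)
{
	const int PAGE_SHIFT = 12;			/* assumed 4KB pages */
	int pages = 128 >> (PAGE_SHIFT - 10);		/* 128 >> 2 == 32 */
	int kb_per_page = 1 << (PAGE_SHIFT - 10);	/* 4KB */

	printf("%d pages x %dKB = %dKB\n",
	       pages, kb_per_page, pages * kb_per_page);	/* 32 x 4 = 128KB */
	return 0;
}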
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6a..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,104 +6,117 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
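
The freezer rework above collapses the old sig_only/thaw_process() interface into freeze_task(p), __thaw_task(p) and a set_freezable() that can itself enter the refrigerator. A minimal consumer-side sketch, not part of the patch, of how a freezable kernel thread is expected to look with this API; the thread function and its sleep are illustrative only:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/delay.h>

/* Illustrative worker thread, assuming it was created with kthread_run(). */
static int example_freezable_thread(void *data)
{
	/*
	 * Clear PF_NOFREEZE under freezer_lock and, if a freeze is
	 * already in progress, enter the refrigerator right away
	 * (that is what the new set_freezable() above does).
	 */
	set_freezable();

	while (!kthread_should_stop()) {
		/* Park in the refrigerator whenever freezing(current). */
		try_to_freeze();

		/* ... one unit of work ... */
		msleep_interruptible(1000);
	}
	return 0;
}
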
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e8..1614be20173d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,7 +55,7 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/module.h> 58#include <linux/export.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
@@ -314,17 +314,29 @@ again:
314#endif 314#endif
315 315
316 lock_page(page_head); 316 lock_page(page_head);
317
318 /*
319 * If page_head->mapping is NULL, then it cannot be a PageAnon
320 * page; but it might be the ZERO_PAGE or in the gate area or
321 * in a special mapping (all cases which we are happy to fail);
322 * or it may have been a good file page when get_user_pages_fast
323 * found it, but truncated or holepunched or subjected to
324 * invalidate_complete_page2 before we got the page lock (also
325 * cases which we are happy to fail). And we hold a reference,
326 * so refcount care in invalidate_complete_page's remove_mapping
327 * prevents drop_caches from setting mapping to NULL beneath us.
328 *
329 * The case we do have to guard against is when memory pressure made
330 * shmem_writepage move it from filecache to swapcache beneath us:
331 * an unlikely race, but we do need to retry for page_head->mapping.
332 */
317 if (!page_head->mapping) { 333 if (!page_head->mapping) {
334 int shmem_swizzled = PageSwapCache(page_head);
318 unlock_page(page_head); 335 unlock_page(page_head);
319 put_page(page_head); 336 put_page(page_head);
320 /* 337 if (shmem_swizzled)
321 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop 338 goto again;
322 * trying to find one. RW mapping would have COW'd (and thus 339 return -EFAULT;
323 * have a mapping) so this page is RO and won't ever change.
324 */
325 if ((page_head == ZERO_PAGE(address)))
326 return -EFAULT;
327 goto again;
328 } 340 }
329 341
330 /* 342 /*
@@ -854,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
854{ 866{
855 struct task_struct *new_owner; 867 struct task_struct *new_owner;
856 struct futex_pi_state *pi_state = this->pi_state; 868 struct futex_pi_state *pi_state = this->pi_state;
857 u32 curval, newval; 869 u32 uninitialized_var(curval), newval;
858 870
859 if (!pi_state) 871 if (!pi_state)
860 return -EINVAL; 872 return -EINVAL;
@@ -916,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
916 928
917static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 929static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
918{ 930{
919 u32 oldval; 931 u32 uninitialized_var(oldval);
920 932
921 /* 933 /*
922 * There is no waiter, so we unlock the futex. The owner died 934 * There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1588 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577 struct futex_pi_state *pi_state = q->pi_state; 1589 struct futex_pi_state *pi_state = q->pi_state;
1578 struct task_struct *oldowner = pi_state->owner; 1590 struct task_struct *oldowner = pi_state->owner;
1579 u32 uval, curval, newval; 1591 u32 uval, uninitialized_var(curval), newval;
1580 int ret; 1592 int ret;
1581 1593
1582 /* Owner died? */ 1594 /* Owner died? */
@@ -1793,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1793 * 1805 *
1794 * Returns: 1806 * Returns:
1795 * 0 - uaddr contains val and hb has been locked 1807 * 0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1808 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1797 */ 1809 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1810static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799 struct futex_q *q, struct futex_hash_bucket **hb) 1811 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2493,7 @@ err_unlock:
2481 */ 2493 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2494int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{ 2495{
2484 u32 uval, nval, mval; 2496 u32 uval, uninitialized_var(nval), mval;
2485 2497
2486retry: 2498retry:
2487 if (get_user(uval, uaddr)) 2499 if (get_user(uval, uaddr))
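
The remaining futex hunks only wrap locals in uninitialized_var(): the values are always stored by the get_user()/cmpxchg paths before they are read, but older compilers cannot prove that and warn. As a rough sketch of what the annotation amounts to on gcc builds (the exact definition lives in include/linux/compiler-gcc.h):

/* roughly: */
#define uninitialized_var(x)	x = x

/*
 * So "u32 uninitialized_var(curval), newval;" is still an ordinary
 * local declaration; it merely silences a spurious "may be used
 * uninitialized" warning on paths gcc cannot analyse.
 */
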
diff --git a/kernel/groups.c b/kernel/groups.c
index 1cc476d52dd3..99b53d1eb7ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/module.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a9205e32a059..ae34bf51682b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/module.h> 35#include <linux/export.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
885 struct hrtimer_clock_base *base, 885 struct hrtimer_clock_base *base,
886 unsigned long newstate, int reprogram) 886 unsigned long newstate, int reprogram)
887{ 887{
888 struct timerqueue_node *next_timer;
888 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 889 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
889 goto out; 890 goto out;
890 891
891 if (&timer->node == timerqueue_getnext(&base->active)) { 892 next_timer = timerqueue_getnext(&base->active);
893 timerqueue_del(&base->active, &timer->node);
894 if (&timer->node == next_timer) {
892#ifdef CONFIG_HIGH_RES_TIMERS 895#ifdef CONFIG_HIGH_RES_TIMERS
893 /* Reprogram the clock event device. if enabled */ 896 /* Reprogram the clock event device. if enabled */
894 if (reprogram && hrtimer_hres_active()) { 897 if (reprogram && hrtimer_hres_active()) {
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 } 904 }
902#endif 905#endif
903 } 906 }
904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active)) 907 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index); 908 base->cpu_base->active_bases &= ~(1 << base->index);
907out: 909out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab86..2e48ec0c2e91 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
74 74
75 /* 75 /*
76 * Ensure the task is not frozen. 76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes 77 * Also, skip vfork and any other user process that freezer should skip.
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */ 78 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count)) 79 if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
80 return;
81
82 /*
83 * When a freshly created task is scheduled once, changes its state to
84 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
85 * musn't be checked.
86 */
87 if (unlikely(!switch_count))
82 return; 88 return;
83 89
84 if (switch_count != t->last_switch_count) { 90 if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc5114b4c16c..f7c543a801d9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
26int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
30 30
31 if (!desc) 31 if (!desc)
32 return -EINVAL; 32 return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
54int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
55{ 55{
56 unsigned long flags; 56 unsigned long flags;
57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
58 int ret = 0; 58 int ret = 0;
59 59
60 if (!desc) 60 if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
78int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
82 82
83 if (!desc) 83 if (!desc)
84 return -EINVAL; 84 return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
99{ 99{
100 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
102 102
103 if (!desc) 103 if (!desc)
104 return -EINVAL; 104 return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
119int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
120{ 120{
121 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
123 123
124 if (!desc) 124 if (!desc)
125 return -EINVAL; 125 return -EINVAL;
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
204 } 204 }
205} 205}
206 206
207void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
208{
209 if (desc->irq_data.chip->irq_enable)
210 desc->irq_data.chip->irq_enable(&desc->irq_data);
211 else
212 desc->irq_data.chip->irq_unmask(&desc->irq_data);
213 cpumask_set_cpu(cpu, desc->percpu_enabled);
214}
215
216void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
217{
218 if (desc->irq_data.chip->irq_disable)
219 desc->irq_data.chip->irq_disable(&desc->irq_data);
220 else
221 desc->irq_data.chip->irq_mask(&desc->irq_data);
222 cpumask_clear_cpu(cpu, desc->percpu_enabled);
223}
224
207static inline void mask_ack_irq(struct irq_desc *desc) 225static inline void mask_ack_irq(struct irq_desc *desc)
208{ 226{
209 if (desc->irq_data.chip->irq_mask_ack) 227 if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 chip->irq_eoi(&desc->irq_data); 562 chip->irq_eoi(&desc->irq_data);
545} 563}
546 564
565/**
566 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
567 * @irq: the interrupt number
568 * @desc: the interrupt description structure for this irq
569 *
570 * Per CPU interrupts on SMP machines without locking requirements. Same as
571 * handle_percpu_irq() above but with the following extras:
572 *
573 * action->percpu_dev_id is a pointer to percpu variables which
574 * contain the real device id for the cpu on which this handler is
575 * called
576 */
577void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
578{
579 struct irq_chip *chip = irq_desc_get_chip(desc);
580 struct irqaction *action = desc->action;
581 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
582 irqreturn_t res;
583
584 kstat_incr_irqs_this_cpu(irq, desc);
585
586 if (chip->irq_ack)
587 chip->irq_ack(&desc->irq_data);
588
589 trace_irq_handler_entry(irq, action);
590 res = action->handler(irq, dev_id);
591 trace_irq_handler_exit(irq, action, res);
592
593 if (chip->irq_eoi)
594 chip->irq_eoi(&desc->irq_data);
595}
596
547void 597void
548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 598__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
549 const char *name) 599 const char *name)
550{ 600{
551 unsigned long flags; 601 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 602 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
553 603
554 if (!desc) 604 if (!desc)
555 return; 605 return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 643void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
594{ 644{
595 unsigned long flags; 645 unsigned long flags;
596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 646 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
597 647
598 if (!desc) 648 if (!desc)
599 return; 649 return;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e38544dddb18..c89295a8f668 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,6 +6,7 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h> 12#include <linux/syscore_ops.h>
@@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
211 } 212 }
212 return gc; 213 return gc;
213} 214}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
214 216
215/* 217/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc 218 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
258 } 260 }
259 gc->irq_cnt = i - gc->irq_base; 261 gc->irq_cnt = i - gc->irq_base;
260} 262}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
261 264
262/** 265/**
263 * irq_setup_alt_chip - Switch to alternative chip 266 * irq_setup_alt_chip - Switch to alternative chip
@@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
281 } 284 }
282 return -EINVAL; 285 return -EINVAL;
283} 286}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
284 288
285/** 289/**
286 * irq_remove_generic_chip - Remove a chip 290 * irq_remove_generic_chip - Remove a chip
@@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
311 irq_modify_status(i, clr, set); 315 irq_modify_status(i, clr, set);
312 } 316 }
313} 317}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
314 319
315#ifdef CONFIG_PM 320#ifdef CONFIG_PM
316static int irq_gc_suspend(void) 321static int irq_gc_suspend(void)
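
The EXPORT_SYMBOL_GPL additions above make the generic-chip helpers usable from modular irqchip drivers rather than only from built-in code. A hedged sketch of that use for a hypothetical memory-mapped controller; the base address, register offsets, irq range and helper choice are all made up for illustration:

#include <linux/irq.h>
#include <linux/io.h>

#define EXAMPLE_IRQ_BASE	64	/* made-up Linux irq range  */
#define EXAMPLE_NR_IRQS		32
#define EXAMPLE_REG_ENABLE	0x00	/* made-up register offsets */
#define EXAMPLE_REG_DISABLE	0x04
#define EXAMPLE_REG_ACK		0x08

static void example_init_irqchip(void __iomem *base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	/* One chip type, level flow handler, 32 interrupts. */
	gc = irq_alloc_generic_chip("example", 1, EXAMPLE_IRQ_BASE,
				    base, handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	ct->chip.irq_ack = irq_gc_ack_set_bit;
	ct->chip.irq_mask = irq_gc_mask_disable_reg;
	ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
	ct->regs.enable = EXAMPLE_REG_ENABLE;
	ct->regs.disable = EXAMPLE_REG_DISABLE;
	ct->regs.ack = EXAMPLE_REG_ACK;

	/* Install the chip on all 32 lines and make them requestable. */
	irq_setup_generic_chip(gc, IRQ_MSK(EXAMPLE_NR_IRQS),
			       IRQ_GC_INIT_MASK_CACHE,
			       IRQ_NOREQUEST, 0);
}
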
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d7..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
74extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
75extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 76extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 77extern void unmask_irq(struct irq_desc *desc);
76 78
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 116 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115} 117}
116 118
119#define _IRQ_DESC_CHECK (1 << 0)
120#define _IRQ_DESC_PERCPU (1 << 1)
121
122#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
123#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
124
117struct irq_desc * 125struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); 126__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
127 unsigned int check);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 128void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120 129
121static inline struct irq_desc * 130static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags) 131irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
123{ 132{
124 return __irq_get_desc_lock(irq, flags, true); 133 return __irq_get_desc_lock(irq, flags, true, check);
125} 134}
126 135
127static inline void 136static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
131} 140}
132 141
133static inline struct irq_desc * 142static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags) 143irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
135{ 144{
136 return __irq_get_desc_lock(irq, flags, false); 145 return __irq_get_desc_lock(irq, flags, false, check);
137} 146}
138 147
139static inline void 148static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 039b889ea053..d86e254b95eb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
424} 424}
425 425
426struct irq_desc * 426struct irq_desc *
427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
428 unsigned int check)
428{ 429{
429 struct irq_desc *desc = irq_to_desc(irq); 430 struct irq_desc *desc = irq_to_desc(irq);
430 431
431 if (desc) { 432 if (desc) {
433 if (check & _IRQ_DESC_CHECK) {
434 if ((check & _IRQ_DESC_PERCPU) &&
435 !irq_settings_is_per_cpu_devid(desc))
436 return NULL;
437
438 if (!(check & _IRQ_DESC_PERCPU) &&
439 irq_settings_is_per_cpu_devid(desc))
440 return NULL;
441 }
442
432 if (bus) 443 if (bus)
433 chip_bus_lock(desc); 444 chip_bus_lock(desc);
434 raw_spin_lock_irqsave(&desc->lock, *flags); 445 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
443 chip_bus_sync_unlock(desc); 454 chip_bus_sync_unlock(desc);
444} 455}
445 456
457int irq_set_percpu_devid(unsigned int irq)
458{
459 struct irq_desc *desc = irq_to_desc(irq);
460
461 if (!desc)
462 return -EINVAL;
463
464 if (desc->percpu_enabled)
465 return -EINVAL;
466
467 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
468
469 if (!desc->percpu_enabled)
470 return -ENOMEM;
471
472 irq_set_percpu_devid_flags(irq);
473 return 0;
474}
475
446/** 476/**
447 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 477 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
448 * @irq: irq number to initialize 478 * @irq: irq number to initialize
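
irq_set_percpu_devid() above is the setup-time half of the per-cpu dev_id support: the interrupt controller or architecture code calls it once per such line to allocate desc->percpu_enabled and flag the descriptor, then installs the handle_percpu_devid_irq flow handler added in chip.c. A hedged sketch of that wiring (the chip pointer and irq number are placeholders):

#include <linux/irq.h>

/* Illustrative: prepare one hwirq (e.g. a local-timer PPI) so that
 * drivers can later claim it with request_percpu_irq(). */
static void example_setup_percpu_devid_irq(unsigned int irq,
					    struct irq_chip *chip)
{
	/* Allocates desc->percpu_enabled, sets IRQ_PER_CPU_DEVID & co. */
	if (irq_set_percpu_devid(irq))
		return;

	/* Flow handler that hands each CPU its own dev_id instance. */
	irq_set_chip_and_handler(irq, chip, handle_percpu_devid_irq);
}
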
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index b57a3776de44..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex);
20void irq_domain_add(struct irq_domain *domain) 20void irq_domain_add(struct irq_domain *domain)
21{ 21{
22 struct irq_data *d; 22 struct irq_data *d;
23 int hwirq; 23 int hwirq, irq;
24 24
25 /* 25 /*
26 * This assumes that the irq_domain owner has already allocated 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic 27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain. 28 * allocation of irq_descs is added to irq_domain.
29 */ 29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 30 irq_domain_for_each_irq(domain, hwirq, irq) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 31 d = irq_get_irq_data(irq);
32 if (!d) { 32 if (!d) {
33 WARN(1, "error: assigning domain to non existant irq_desc"); 33 WARN(1, "error: assigning domain to non existant irq_desc");
34 return; 34 return;
@@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain)
54void irq_domain_del(struct irq_domain *domain) 54void irq_domain_del(struct irq_domain *domain)
55{ 55{
56 struct irq_data *d; 56 struct irq_data *d;
57 int hwirq; 57 int hwirq, irq;
58 58
59 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 60 list_del(&domain->list);
61 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
62 62
63 /* Clear the irq_domain assignments */ 63 /* Clear the irq_domain assignments */
64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 64 irq_domain_for_each_irq(domain, hwirq, irq) {
65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 65 d = irq_get_irq_data(irq);
66 d->domain = NULL; 66 d->domain = NULL;
67 } 67 }
68} 68}
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa20308..a9a9dbe49fea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
196{ 196{
197 unsigned long flags; 197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
199 199
200 if (!desc) 200 if (!desc)
201 return -EINVAL; 201 return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
356static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
357{ 357{
358 unsigned long flags; 358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
360 360
361 if (!desc) 361 if (!desc)
362 return -EINVAL; 362 return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
448void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
449{ 449{
450 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
452 452
453 if (!desc) 453 if (!desc)
454 return; 454 return;
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
467 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
468 int ret = -ENXIO; 468 int ret = -ENXIO;
469 469
470 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
471 return 0;
472
470 if (desc->irq_data.chip->irq_set_wake) 473 if (desc->irq_data.chip->irq_set_wake)
471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 474 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
472 475
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
488int irq_set_irq_wake(unsigned int irq, unsigned int on) 491int irq_set_irq_wake(unsigned int irq, unsigned int on)
489{ 492{
490 unsigned long flags; 493 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 494 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
492 int ret = 0; 495 int ret = 0;
493 496
494 if (!desc) 497 if (!desc)
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
529int can_request_irq(unsigned int irq, unsigned long irqflags) 532int can_request_irq(unsigned int irq, unsigned long irqflags)
530{ 533{
531 unsigned long flags; 534 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 535 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
533 int canrequest = 0; 536 int canrequest = 0;
534 537
535 if (!desc) 538 if (!desc)
@@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 623
621static int irq_wait_for_interrupt(struct irqaction *action) 624static int irq_wait_for_interrupt(struct irqaction *action)
622{ 625{
626 set_current_state(TASK_INTERRUPTIBLE);
627
623 while (!kthread_should_stop()) { 628 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 629
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 630 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 631 &action->thread_flags)) {
@@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 633 return 0;
630 } 634 }
631 schedule(); 635 schedule();
636 set_current_state(TASK_INTERRUPTIBLE);
632 } 637 }
638 __set_current_state(TASK_RUNNING);
633 return -1; 639 return -1;
634} 640}
635 641
@@ -1118,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1118 int retval; 1124 int retval;
1119 struct irq_desc *desc = irq_to_desc(irq); 1125 struct irq_desc *desc = irq_to_desc(irq);
1120 1126
1127 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1128 return -EINVAL;
1121 chip_bus_lock(desc); 1129 chip_bus_lock(desc);
1122 retval = __setup_irq(irq, desc, act); 1130 retval = __setup_irq(irq, desc, act);
1123 chip_bus_sync_unlock(desc); 1131 chip_bus_sync_unlock(desc);
@@ -1126,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1126} 1134}
1127EXPORT_SYMBOL_GPL(setup_irq); 1135EXPORT_SYMBOL_GPL(setup_irq);
1128 1136
1129 /* 1137/*
1130 * Internal function to unregister an irqaction - used to free 1138 * Internal function to unregister an irqaction - used to free
1131 * regular and special interrupts that are part of the architecture. 1139 * regular and special interrupts that are part of the architecture.
1132 */ 1140 */
@@ -1224,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1224 */ 1232 */
1225void remove_irq(unsigned int irq, struct irqaction *act) 1233void remove_irq(unsigned int irq, struct irqaction *act)
1226{ 1234{
1227 __free_irq(irq, act->dev_id); 1235 struct irq_desc *desc = irq_to_desc(irq);
1236
1237 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1238 __free_irq(irq, act->dev_id);
1228} 1239}
1229EXPORT_SYMBOL_GPL(remove_irq); 1240EXPORT_SYMBOL_GPL(remove_irq);
1230 1241
@@ -1246,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id)
1246{ 1257{
1247 struct irq_desc *desc = irq_to_desc(irq); 1258 struct irq_desc *desc = irq_to_desc(irq);
1248 1259
1249 if (!desc) 1260 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1250 return; 1261 return;
1251 1262
1252#ifdef CONFIG_SMP 1263#ifdef CONFIG_SMP
@@ -1281,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1281 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1282 * 1293 *
1283 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1284 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1285 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1286 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1287 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
@@ -1324,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1324 if (!desc) 1335 if (!desc)
1325 return -EINVAL; 1336 return -EINVAL;
1326 1337
1327 if (!irq_settings_can_request(desc)) 1338 if (!irq_settings_can_request(desc) ||
1339 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1328 return -EINVAL; 1340 return -EINVAL;
1329 1341
1330 if (!handler) { 1342 if (!handler) {
@@ -1409,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1409 return !ret ? IRQC_IS_HARDIRQ : ret; 1421 return !ret ? IRQC_IS_HARDIRQ : ret;
1410} 1422}
1411EXPORT_SYMBOL_GPL(request_any_context_irq); 1423EXPORT_SYMBOL_GPL(request_any_context_irq);
1424
1425void enable_percpu_irq(unsigned int irq, unsigned int type)
1426{
1427 unsigned int cpu = smp_processor_id();
1428 unsigned long flags;
1429 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1430
1431 if (!desc)
1432 return;
1433
1434 type &= IRQ_TYPE_SENSE_MASK;
1435 if (type != IRQ_TYPE_NONE) {
1436 int ret;
1437
1438 ret = __irq_set_trigger(desc, irq, type);
1439
1440 if (ret) {
1441 WARN(1, "failed to set type for IRQ%d\n", irq);
1442 goto out;
1443 }
1444 }
1445
1446 irq_percpu_enable(desc, cpu);
1447out:
1448 irq_put_desc_unlock(desc, flags);
1449}
1450
1451void disable_percpu_irq(unsigned int irq)
1452{
1453 unsigned int cpu = smp_processor_id();
1454 unsigned long flags;
1455 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1456
1457 if (!desc)
1458 return;
1459
1460 irq_percpu_disable(desc, cpu);
1461 irq_put_desc_unlock(desc, flags);
1462}
1463
1464/*
1465 * Internal function to unregister a percpu irqaction.
1466 */
1467static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1468{
1469 struct irq_desc *desc = irq_to_desc(irq);
1470 struct irqaction *action;
1471 unsigned long flags;
1472
1473 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1474
1475 if (!desc)
1476 return NULL;
1477
1478 raw_spin_lock_irqsave(&desc->lock, flags);
1479
1480 action = desc->action;
1481 if (!action || action->percpu_dev_id != dev_id) {
1482 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1483 goto bad;
1484 }
1485
1486 if (!cpumask_empty(desc->percpu_enabled)) {
1487 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1488 irq, cpumask_first(desc->percpu_enabled));
1489 goto bad;
1490 }
1491
1492 /* Found it - now remove it from the list of entries: */
1493 desc->action = NULL;
1494
1495 raw_spin_unlock_irqrestore(&desc->lock, flags);
1496
1497 unregister_handler_proc(irq, action);
1498
1499 module_put(desc->owner);
1500 return action;
1501
1502bad:
1503 raw_spin_unlock_irqrestore(&desc->lock, flags);
1504 return NULL;
1505}
1506
1507/**
1508 * remove_percpu_irq - free a per-cpu interrupt
1509 * @irq: Interrupt line to free
1510 * @act: irqaction for the interrupt
1511 *
1512 * Used to remove interrupts statically setup by the early boot process.
1513 */
1514void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1515{
1516 struct irq_desc *desc = irq_to_desc(irq);
1517
1518 if (desc && irq_settings_is_per_cpu_devid(desc))
1519 __free_percpu_irq(irq, act->percpu_dev_id);
1520}
1521
1522/**
1523 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1524 * @irq: Interrupt line to free
1525 * @dev_id: Device identity to free
1526 *
1527 * Remove a percpu interrupt handler. The handler is removed, but
1528 * the interrupt line is not disabled. This must be done on each
1529 * CPU before calling this function. The function does not return
1530 * until any executing interrupts for this IRQ have completed.
1531 *
1532 * This function must not be called from interrupt context.
1533 */
1534void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1535{
1536 struct irq_desc *desc = irq_to_desc(irq);
1537
1538 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1539 return;
1540
1541 chip_bus_lock(desc);
1542 kfree(__free_percpu_irq(irq, dev_id));
1543 chip_bus_sync_unlock(desc);
1544}
1545
1546/**
1547 * setup_percpu_irq - setup a per-cpu interrupt
1548 * @irq: Interrupt line to setup
1549 * @act: irqaction for the interrupt
1550 *
1551 * Used to statically setup per-cpu interrupts in the early boot process.
1552 */
1553int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1554{
1555 struct irq_desc *desc = irq_to_desc(irq);
1556 int retval;
1557
1558 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1559 return -EINVAL;
1560 chip_bus_lock(desc);
1561 retval = __setup_irq(irq, desc, act);
1562 chip_bus_sync_unlock(desc);
1563
1564 return retval;
1565}
1566
1567/**
1568 * request_percpu_irq - allocate a percpu interrupt line
1569 * @irq: Interrupt line to allocate
1570 * @handler: Function to be called when the IRQ occurs.
1571 * @devname: An ascii name for the claiming device
1572 * @dev_id: A percpu cookie passed back to the handler function
1573 *
1574 * This call allocates interrupt resources, but doesn't
1575 * automatically enable the interrupt. It has to be done on each
1576 * CPU using enable_percpu_irq().
1577 *
1578 * Dev_id must be globally unique. It is a per-cpu variable, and
1579 * the handler gets called with the interrupted CPU's instance of
1580 * that variable.
1581 */
1582int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1583 const char *devname, void __percpu *dev_id)
1584{
1585 struct irqaction *action;
1586 struct irq_desc *desc;
1587 int retval;
1588
1589 if (!dev_id)
1590 return -EINVAL;
1591
1592 desc = irq_to_desc(irq);
1593 if (!desc || !irq_settings_can_request(desc) ||
1594 !irq_settings_is_per_cpu_devid(desc))
1595 return -EINVAL;
1596
1597 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1598 if (!action)
1599 return -ENOMEM;
1600
1601 action->handler = handler;
1602 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1603 action->name = devname;
1604 action->percpu_dev_id = dev_id;
1605
1606 chip_bus_lock(desc);
1607 retval = __setup_irq(irq, desc, action);
1608 chip_bus_sync_unlock(desc);
1609
1610 if (retval)
1611 kfree(action);
1612
1613 return retval;
1614}
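
request_percpu_irq()/enable_percpu_irq() above are the driver-facing half: the cookie is a percpu variable and the handler receives the invoking CPU's instance of it, as handle_percpu_devid_irq() shows in the chip.c hunk. A minimal consumer sketch, assuming the line was prepared with irq_set_percpu_devid() by platform code; the percpu structure and names are illustrative:

#include <linux/interrupt.h>
#include <linux/percpu.h>

struct example_percpu_state {
	unsigned long count;		/* illustrative per-cpu bookkeeping */
};

static DEFINE_PER_CPU(struct example_percpu_state, example_state);

static irqreturn_t example_percpu_handler(int irq, void *dev_id)
{
	/* dev_id is this CPU's instance of example_state. */
	struct example_percpu_state *st = dev_id;

	st->count++;
	return IRQ_HANDLED;
}

static int example_percpu_setup(unsigned int irq)
{
	int err;

	err = request_percpu_irq(irq, example_percpu_handler,
				 "example-percpu", &example_state);
	if (err)
		return err;

	/*
	 * Unlike request_irq(), nothing is unmasked yet: each CPU has
	 * to enable its own copy of the line, typically from a CPU
	 * notifier or an on_each_cpu() callback.
	 */
	enable_percpu_irq(irq, IRQ_TYPE_NONE);
	return 0;
}
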
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c9877..15e53b1766a6 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
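
The pm.c rework splits resume into two passes: actions flagged IRQF_EARLY_RESUME are re-enabled from a syscore op, before ordinary device resume, while everything else keeps going through resume_device_irqs(). From a driver the only visible change is the flag at request time; a hedged sketch with placeholder names:

#include <linux/interrupt.h>

static irqreturn_t example_early_handler(int irq, void *dev)
{
	return IRQ_HANDLED;
}

static int example_request_early_irq(unsigned int irq, void *dev)
{
	/*
	 * Re-enabled by irq_pm_syscore_resume(), i.e. before
	 * resume_device_irqs() runs for the non-early lines.
	 */
	return request_irq(irq, example_early_handler, IRQF_EARLY_RESUME,
			   "example-early", dev);
}
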
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d444..1162f1030f18 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17}; 18};
18 19
@@ -24,6 +25,7 @@ enum {
24#define IRQ_NOTHREAD GOT_YOU_MORON 25#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON 26#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON 27#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK 29#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON 30#define IRQF_MODIFY_MASK GOT_YOU_MORON
29 31
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
39 return desc->status_use_accessors & _IRQ_PER_CPU; 41 return desc->status_use_accessors & _IRQ_PER_CPU;
40} 42}
41 43
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{ 50{
44 desc->status_use_accessors |= _IRQ_PER_CPU; 51 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c1..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
323 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
324} 326}
325 327
326int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
327 329
328int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
329{ 331{
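
The misrouted_irq() hunk above fixes an inverted guard: the poller is supposed to run only when this CPU is the sole one inside it, but the old test bailed out exactly when it was first in. The intended single-poller idiom, sketched in isolation with illustrative names:

#include <linux/atomic.h>

static atomic_t example_poll_active = ATOMIC_INIT(0);

static void example_poll(void)
{
	/*
	 * atomic_inc_return() == 1 means we were first to enter; any
	 * other value means another CPU is already polling, so back
	 * off rather than race it.
	 */
	if (atomic_inc_return(&example_poll_active) != 1)
		goto out;

	/* ... walk and poll the interrupt lines here ... */

out:
	atomic_dec(&example_poll_active);
}
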
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..c3c46c72046e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -6,9 +6,11 @@
6 */ 6 */
7 7
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/irq_work.h> 10#include <linux/irq_work.h>
11#include <linux/percpu.h>
11#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <asm/processor.h>
12 14
13/* 15/*
14 * An entry can be in one of four states: 16 * An entry can be in one of four states:
@@ -17,54 +19,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 19 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 20 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 21 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 22 */
24 23
25#define IRQ_WORK_PENDING 1UL 24#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 25#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 26#define IRQ_WORK_FLAGS 3UL
28 27
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 28static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 29
50/* 30/*
51 * Claim the entry so that no one else will poke at it. 31 * Claim the entry so that no one else will poke at it.
52 */ 32 */
53static bool irq_work_claim(struct irq_work *entry) 33static bool irq_work_claim(struct irq_work *work)
54{ 34{
55 struct irq_work *next, *nflags; 35 unsigned long flags, nflags;
56 36
57 do { 37 for (;;) {
58 next = entry->next; 38 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 39 if (flags & IRQ_WORK_PENDING)
60 return false; 40 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 41 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 42 if (cmpxchg(&work->flags, flags, nflags) == flags)
43 break;
44 cpu_relax();
45 }
63 46
64 return true; 47 return true;
65} 48}
66 49
67
68void __weak arch_irq_work_raise(void) 50void __weak arch_irq_work_raise(void)
69{ 51{
70 /* 52 /*
@@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void)
75/* 57/*
76 * Queue the entry and raise the IPI if needed. 58 * Queue the entry and raise the IPI if needed.
77 */ 59 */
78static void __irq_work_queue(struct irq_work *entry) 60static void __irq_work_queue(struct irq_work *work)
79{ 61{
80 struct irq_work *next; 62 bool empty;
81 63
82 preempt_disable(); 64 preempt_disable();
83 65
84 do { 66 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 67 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 68 if (empty)
92 arch_irq_work_raise(); 69 arch_irq_work_raise();
93 70
94 preempt_enable(); 71 preempt_enable();
@@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 77 *
101 * Can be re-enqueued while the callback is still in progress. 78 * Can be re-enqueued while the callback is still in progress.
102 */ 79 */
103bool irq_work_queue(struct irq_work *entry) 80bool irq_work_queue(struct irq_work *work)
104{ 81{
105 if (!irq_work_claim(entry)) { 82 if (!irq_work_claim(work)) {
106 /* 83 /*
107 * Already enqueued, can't do! 84 * Already enqueued, can't do!
108 */ 85 */
109 return false; 86 return false;
110 } 87 }
111 88
112 __irq_work_queue(entry); 89 __irq_work_queue(work);
113 return true; 90 return true;
114} 91}
115EXPORT_SYMBOL_GPL(irq_work_queue); 92EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 97 */
121void irq_work_run(void) 98void irq_work_run(void)
122{ 99{
123 struct irq_work *list; 100 struct irq_work *work;
101 struct llist_head *this_list;
102 struct llist_node *llnode;
124 103
125 if (this_cpu_read(irq_work_list) == NULL) 104 this_list = &__get_cpu_var(irq_work_list);
105 if (llist_empty(this_list))
126 return; 106 return;
127 107
128 BUG_ON(!in_irq()); 108 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 109 BUG_ON(!irqs_disabled());
130 110
131 list = this_cpu_xchg(irq_work_list, NULL); 111 llnode = llist_del_all(this_list);
132 112 while (llnode != NULL) {
133 while (list != NULL) { 113 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 114
136 list = irq_work_next(list); 115 llnode = llist_next(llnode);
137 116
138 /* 117 /*
139 * Clear the PENDING bit, after this point the @entry 118 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 119 * can be re-used.
141 */ 120 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 121 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 122 work->func(work);
144 /* 123 /*
145 * Clear the BUSY bit and return to the free state if 124 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 125 * no-one else claimed it meanwhile.
147 */ 126 */
148 (void)cmpxchg(&entry->next, 127 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 128 }
152} 129}
153EXPORT_SYMBOL_GPL(irq_work_run); 130EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 133 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 134 * currently in use.
158 */ 135 */
159void irq_work_sync(struct irq_work *entry) 136void irq_work_sync(struct irq_work *work)
160{ 137{
161 WARN_ON_ONCE(irqs_disabled()); 138 WARN_ON_ONCE(irqs_disabled());
162 139
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 140 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 141 cpu_relax();
165} 142}
166EXPORT_SYMBOL_GPL(irq_work_sync); 143EXPORT_SYMBOL_GPL(irq_work_sync);
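
The llist conversion above changes only the internals: the PENDING/BUSY bits move out of the next pointer into work->flags and the per-cpu queue becomes an llist, while the claim/queue/run/sync contract from the state comment stays the same. A hedged consumer sketch, assuming the usual init_irq_work() helper from <linux/irq_work.h>; the callback body and trigger site are illustrative:

#include <linux/irq_work.h>

/* Runs from irq_work_run(), i.e. in hard interrupt context. */
static void example_irq_work_func(struct irq_work *work)
{
	/* do the part that was not safe in the queueing (e.g. NMI) context */
}

static struct irq_work example_work;

static void example_init(void)
{
	init_irq_work(&example_work, example_irq_work_func);
}

static void example_trigger(void)
{
	/* Safe from NMI/IRQ context; returns false if already pending. */
	irq_work_queue(&example_work);
}
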
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
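
The itimer hunks are part of the tree-wide cputime cleanup: cputime_t is treated as a plain arithmetic type, so the accessor macros collapse into ordinary operators. The mapping, assuming the old <asm/cputime.h> helper names:

/*
 * old accessor form                  plain-scalar form
 * -----------------------------------------------------
 * cputime_eq(cval, cputime_zero)     cval == 0
 * cputime_add(utime, stime)          utime + stime
 * cputime_sub(cval, t)               cval - t
 * cputime_gt(nval, cputime_zero)     nval > 0
 */
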
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3d..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key)
66 return; 66 return;
67 67
68 jump_label_lock(); 68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1) 69 if (atomic_read(&key->enabled) == 0)
70 jump_label_update(key, JUMP_LABEL_ENABLE); 70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
73 75
74void jump_label_dec(struct jump_label_key *key) 76static void __jump_label_dec(struct jump_label_key *key,
77 unsigned long rate_limit, struct delayed_work *work)
75{ 78{
76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
77 return; 80 return;
78 81
79 jump_label_update(key, JUMP_LABEL_DISABLE); 82 if (rate_limit) {
83 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit);
85 } else
86 jump_label_update(key, JUMP_LABEL_DISABLE);
87
80 jump_label_unlock(); 88 jump_label_unlock();
81} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91
92static void jump_label_update_timeout(struct work_struct *work)
93{
94 struct jump_label_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL);
97}
98
99void jump_label_dec(struct jump_label_key *key)
100{
101 __jump_label_dec(key, 0, NULL);
102}
103
104void jump_label_dec_deferred(struct jump_label_key_deferred *key)
105{
106 __jump_label_dec(&key->key, key->timeout, &key->work);
107}
108
109
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl)
112{
113 key->timeout = rl;
114 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
115}
82 116
83static int addr_conflict(struct jump_entry *entry, void *start, void *end) 117static int addr_conflict(struct jump_entry *entry, void *start, void *end)
84{ 118{
@@ -104,6 +138,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
104 return 0; 138 return 0;
105} 139}
106 140
141/*
142 * Update code which is definitely not currently executing.
143 * Architectures which need heavyweight synchronization to modify
144 * running code can override this to make the non-live update case
145 * cheaper.
146 */
147void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
148 enum jump_label_type type)
149{
150 arch_jump_label_transform(entry, type);
151}
152
107static void __jump_label_update(struct jump_label_key *key, 153static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, 154 struct jump_entry *entry,
109 struct jump_entry *stop, int enable) 155 struct jump_entry *stop, int enable)
@@ -121,14 +167,7 @@ static void __jump_label_update(struct jump_label_key *key,
121 } 167 }
122} 168}
123 169
124/* 170void __init jump_label_init(void)
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{ 171{
133 struct jump_entry *iter_start = __start___jump_table; 172 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table; 173 struct jump_entry *iter_stop = __stop___jump_table;
@@ -139,22 +178,22 @@ static __init int jump_label_init(void)
139 jump_label_sort_entries(iter_start, iter_stop); 178 jump_label_sort_entries(iter_start, iter_stop);
140 179
141 for (iter = iter_start; iter < iter_stop; iter++) { 180 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code); 181 struct jump_label_key *iterk;
143 if (iter->key == (jump_label_t)(unsigned long)key) 182
183 iterk = (struct jump_label_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key)
144 continue; 187 continue;
145 188
146 key = (struct jump_label_key *)(unsigned long)iter->key; 189 key = iterk;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter; 190 key->entries = iter;
149#ifdef CONFIG_MODULES 191#ifdef CONFIG_MODULES
150 key->next = NULL; 192 key->next = NULL;
151#endif 193#endif
152 } 194 }
153 jump_label_unlock(); 195 jump_label_unlock();
154
155 return 0;
156} 196}
157early_initcall(jump_label_init);
158 197
159#ifdef CONFIG_MODULES 198#ifdef CONFIG_MODULES
160 199
@@ -211,8 +250,13 @@ void jump_label_apply_nops(struct module *mod)
211 if (iter_start == iter_stop) 250 if (iter_start == iter_stop)
212 return; 251 return;
213 252
214 for (iter = iter_start; iter < iter_stop; iter++) 253 for (iter = iter_start; iter < iter_stop; iter++) {
215 arch_jump_label_text_poke_early(iter->code); 254 struct jump_label_key *iterk;
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 }
216} 260}
217 261
218static int jump_label_add_module(struct module *mod) 262static int jump_label_add_module(struct module *mod)
@@ -252,8 +296,7 @@ static int jump_label_add_module(struct module *mod)
252 key->next = jlm; 296 key->next = jlm;
253 297
254 if (jump_label_enabled(key)) 298 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop, 299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
256 JUMP_LABEL_ENABLE);
257 } 300 }
258 301
259 return 0; 302 return 0;
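
The jump_label hunks above hinge on __weak linkage: jump_label.c supplies arch_jump_label_transform_static() as a weak default that simply calls arch_jump_label_transform(), and an architecture may provide a strong definition that wins at link time. A minimal stand-alone sketch of that override pattern, with hypothetical names and GCC attribute syntax rather than kernel code:

#include <stdio.h>

/* Weak default: used only if no strong definition of transform() is linked in. */
__attribute__((weak)) void transform(int entry)
{
        printf("generic transform of entry %d\n", entry);
}

/*
 * An "arch" object file could provide a strong definition instead:
 *
 *      void transform(int entry)
 *      {
 *              printf("arch-specific transform of entry %d\n", entry);
 *      }
 */

int main(void)
{
        transform(42);          /* whichever definition was linked in wins */
        return 0;
}
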
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 296fbc84d659..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -498,7 +497,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
498 while (hole_end <= crashk_res.end) { 497 while (hole_end <= crashk_res.end) {
499 unsigned long i; 498 unsigned long i;
500 499
501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 500 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
502 break; 501 break;
503 if (hole_end > crashk_res.end) 502 if (hole_end > crashk_res.end)
504 break; 503 break;
@@ -999,6 +998,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
999 kimage_free(xchg(&kexec_crash_image, NULL)); 998 kimage_free(xchg(&kexec_crash_image, NULL));
1000 result = kimage_crash_alloc(&image, entry, 999 result = kimage_crash_alloc(&image, entry,
1001 nr_segments, segments); 1000 nr_segments, segments);
1001 crash_map_reserved_pages();
1002 } 1002 }
1003 if (result) 1003 if (result)
1004 goto out; 1004 goto out;
@@ -1015,6 +1015,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1015 goto out; 1015 goto out;
1016 } 1016 }
1017 kimage_terminate(image); 1017 kimage_terminate(image);
1018 if (flags & KEXEC_ON_CRASH)
1019 crash_unmap_reserved_pages();
1018 } 1020 }
1019 /* Install the new kernel, and Uninstall the old */ 1021 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1022 image = xchg(dest_image, image);
@@ -1026,6 +1028,18 @@ out:
1026 return result; 1028 return result;
1027} 1029}
1028 1030
1031/*
1032 * Add and remove page tables for crashkernel memory
1033 *
1034 * Provide an empty default implementation here -- architecture
1035 * code may override this
1036 */
1037void __weak crash_map_reserved_pages(void)
1038{}
1039
1040void __weak crash_unmap_reserved_pages(void)
1041{}
1042
1029#ifdef CONFIG_COMPAT 1043#ifdef CONFIG_COMPAT
1030asmlinkage long compat_sys_kexec_load(unsigned long entry, 1044asmlinkage long compat_sys_kexec_load(unsigned long entry,
1031 unsigned long nr_segments, 1045 unsigned long nr_segments,
@@ -1079,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1079 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1080 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1081 1095
1082 kmsg_dump(KMSG_DUMP_KEXEC);
1083
1084 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1085 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1086 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1117,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1117{ 1129{
1118 int ret = 0; 1130 int ret = 0;
1119 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1120 1134
1121 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1122 1136
@@ -1126,23 +1140,37 @@ int crash_shrink_memory(unsigned long new_size)
1126 } 1140 }
1127 start = crashk_res.start; 1141 start = crashk_res.start;
1128 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1129 1148
1130 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1131 ret = -EINVAL; 1150 if (!ram_res) {
1132 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1133 ret = 0;
1134 goto unlock; 1152 goto unlock;
1135 } 1153 }
1136 1154
1137 start = roundup(start, PAGE_SIZE); 1155 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1138 end = roundup(start + new_size, PAGE_SIZE); 1156 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1139 1157
1158 crash_map_reserved_pages();
1140 crash_free_reserved_phys_range(end, crashk_res.end); 1159 crash_free_reserved_phys_range(end, crashk_res.end);
1141 1160
1142 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1143 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1144 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1145 1170
1171 insert_resource(&iomem_resource, ram_res);
1172 crash_unmap_reserved_pages();
1173
1146unlock: 1174unlock:
1147 mutex_unlock(&kexec_mutex); 1175 mutex_unlock(&kexec_mutex);
1148 return ret; 1176 return ret;
@@ -1380,24 +1408,23 @@ int __init parse_crashkernel(char *cmdline,
1380} 1408}
1381 1409
1382 1410
1383 1411static void update_vmcoreinfo_note(void)
1384void crash_save_vmcoreinfo(void)
1385{ 1412{
1386 u32 *buf; 1413 u32 *buf = vmcoreinfo_note;
1387 1414
1388 if (!vmcoreinfo_size) 1415 if (!vmcoreinfo_size)
1389 return; 1416 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1417 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1396 vmcoreinfo_size); 1418 vmcoreinfo_size);
1397
1398 final_note(buf); 1419 final_note(buf);
1399} 1420}
1400 1421
1422void crash_save_vmcoreinfo(void)
1423{
1424 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1425 update_vmcoreinfo_note();
1426}
1427
1401void vmcoreinfo_append_str(const char *fmt, ...) 1428void vmcoreinfo_append_str(const char *fmt, ...)
1402{ 1429{
1403 va_list args; 1430 va_list args;
@@ -1483,6 +1510,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1483 VMCOREINFO_NUMBER(PG_swapcache); 1510 VMCOREINFO_NUMBER(PG_swapcache);
1484 1511
1485 arch_crash_save_vmcoreinfo(); 1512 arch_crash_save_vmcoreinfo();
1513 update_vmcoreinfo_note();
1486 1514
1487 return 0; 1515 return 0;
1488} 1516}
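
The vmcoreinfo rework above splits note generation into update_vmcoreinfo_note(), which is now also run once at init time. For readers unfamiliar with the layout being built, here is a stand-alone sketch of an ELF note as append_elf_note() assembles it: three 32-bit words (namesz, descsz, type) followed by the 4-byte-padded name and descriptor. The buffer size and payload below are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t *append_note(uint32_t *buf, const char *name,
                             uint32_t type, const void *desc, uint32_t len)
{
        uint32_t namesz = (uint32_t)strlen(name) + 1;

        *buf++ = namesz;                /* Elf_Nhdr.n_namesz */
        *buf++ = len;                   /* Elf_Nhdr.n_descsz */
        *buf++ = type;                  /* Elf_Nhdr.n_type   */
        memcpy(buf, name, namesz);
        buf += (namesz + 3) / 4;        /* name is padded to a 4-byte boundary */
        memcpy(buf, desc, len);
        buf += (len + 3) / 4;           /* so is the descriptor */
        return buf;
}

int main(void)
{
        uint32_t note[64] = { 0 };
        const char data[] = "CRASHTIME=1320000000\n";
        uint32_t *end = append_note(note, "VMCOREINFO", 0, data, sizeof(data));

        printf("note occupies %ld bytes\n", (long)((char *)end - (char *)note));
        return 0;
}
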
@@ -1506,7 +1534,7 @@ int kernel_kexec(void)
1506 1534
1507#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1508 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1509 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1510 pm_prepare_console(); 1538 pm_prepare_console();
1511 error = freeze_processes(); 1539 error = freeze_processes();
1512 if (error) { 1540 if (error) {
@@ -1559,7 +1587,7 @@ int kernel_kexec(void)
1559 thaw_processes(); 1587 thaw_processes();
1560 Restore_console: 1588 Restore_console:
1561 pm_restore_console(); 1589 pm_restore_console();
1562 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1563 } 1591 }
1564#endif 1592#endif
1565 1593
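
crash_shrink_memory() now refuses anything but a genuine shrink, rounds the kept region to KEXEC_CRASH_MEM_ALIGN, and hands the freed tail back as a "System RAM" resource, all bracketed by the new crash_map/unmap_reserved_pages() hooks. A stand-alone sketch of just the size arithmetic; the addresses, alignment and sizes below are made up, and ROUNDUP stands in for the kernel's roundup():

#include <stdio.h>

#define ROUNDUP(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define ALIGN_SZ        4096UL          /* stand-in for KEXEC_CRASH_MEM_ALIGN */

int main(void)
{
        unsigned long start    = 0x10000000UL;  /* crashk_res.start (invented) */
        unsigned long orig_end = 0x11ffffffUL;  /* crashk_res.end   (invented) */
        unsigned long old_size = (orig_end == 0) ? 0 : orig_end - start + 1;
        unsigned long new_size = 8UL << 20;     /* shrink to 8 MiB */
        unsigned long end;

        if (new_size >= old_size)               /* growing is not allowed */
                return (new_size == old_size) ? 0 : 1;

        start = ROUNDUP(start, ALIGN_SZ);
        end   = ROUNDUP(start + new_size, ALIGN_SZ);

        printf("crashkernel keeps [%#lx - %#lx]\n", start, end - 1);
        printf("[%#lx - %#lx] goes back to System RAM\n", end, orig_end);
        return 0;
}
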
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 01a0700e873f..c744b88c44e2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c1305..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -114,10 +116,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 116 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 117 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 118 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 119 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 120 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 121 "request_module: runaway loop modprobe %s\n",
120 module_name); 122 module_name);
123 kmod_loop_msg++;
124 }
121 atomic_dec(&kmod_concurrent); 125 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 126 return -ENOMEM;
123 } 127 }
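
The kmod_loop_msg hunk above changes the rate limit so the counter is only bumped when a message is actually emitted; a post-increment in the condition keeps counting on every call and could eventually wrap and start printing again. A stand-alone sketch of the resulting pattern (the name and the limit of five mirror the hunk, the rest is invented):

#include <stdio.h>

static int loop_msg;

static void report_runaway(const char *name)
{
        if (loop_msg < 5) {
                fprintf(stderr, "request_module: runaway loop modprobe %s\n", name);
                loop_msg++;             /* only counts messages actually printed */
        }
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                report_runaway("dummy_mod");    /* only the first five print */
        return 0;
}
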
@@ -273,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
274 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
275 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
276 */ 281 */
277static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
278 283
@@ -280,17 +285,29 @@ static int usermodehelper_disabled = 1;
280static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
281 286
282/* 287/*
283 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
284 * helpers to finish. 289 * helpers to finish.
285 */ 290 */
286static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
287 292
288/* 293/*
289 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
290 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
291 */ 296 */
292#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
293 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
294/** 311/**
295 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
296 */ 313 */
@@ -298,8 +315,10 @@ int usermodehelper_disable(void)
298{ 315{
299 long retval; 316 long retval;
300 317
318 down_write(&umhelper_sem);
301 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
302 smp_mb(); 320 up_write(&umhelper_sem);
321
303 /* 322 /*
304 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
305 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -312,7 +331,9 @@ int usermodehelper_disable(void)
312 if (retval) 331 if (retval)
313 return 0; 332 return 0;
314 333
334 down_write(&umhelper_sem);
315 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
316 return -EAGAIN; 337 return -EAGAIN;
317} 338}
318 339
@@ -321,7 +342,9 @@ int usermodehelper_disable(void)
321 */ 342 */
322void usermodehelper_enable(void) 343void usermodehelper_enable(void)
323{ 344{
345 down_write(&umhelper_sem);
324 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
325} 348}
326 349
327/** 350/**
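
The kmod.c changes above replace the smp_mb() with a proper rw_semaphore: usermodehelper_disable()/enable() flip usermodehelper_disabled with umhelper_sem held for write, and the new read_lock_usermodehelper()/read_unlock_usermodehelper() pair lets callers sample the flag knowing it cannot change underneath them. A stand-alone sketch of the same reader/writer discipline using POSIX rwlocks; all names below are invented (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t umh_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool umh_disabled = true;        /* plays the role of usermodehelper_disabled */

static void umh_enable(void)            /* writer: the flag only changes under the write lock */
{
        pthread_rwlock_wrlock(&umh_lock);
        umh_disabled = false;
        pthread_rwlock_unlock(&umh_lock);
}

static bool umh_try_use(void)           /* reader: the flag is stable while the read lock is held */
{
        bool ok;

        pthread_rwlock_rdlock(&umh_lock);
        ok = !umh_disabled;
        if (ok)
                printf("would start a helper here\n");
        pthread_rwlock_unlock(&umh_lock);
        return ok;
}

int main(void)
{
        umh_try_use();                  /* refused: helpers still disabled */
        umh_enable();
        umh_try_use();                  /* allowed */
        return 0;
}
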
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb985..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/module.h> 39#include <linux/export.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 spinlock_t lock ____cacheline_aligned_in_smp; 81 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1013 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1014 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1015 if (likely(rp)) { 1015 if (likely(rp)) {
1016 spin_lock(&rp->lock); 1016 raw_spin_lock(&rp->lock);
1017 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1018 spin_unlock(&rp->lock); 1018 raw_spin_unlock(&rp->lock);
1019 } else 1019 } else
1020 /* Unregistering */ 1020 /* Unregistering */
1021 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1026__acquires(hlist_lock) 1026__acquires(hlist_lock)
1027{ 1027{
1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1029 spinlock_t *hlist_lock; 1029 raw_spinlock_t *hlist_lock;
1030 1030
1031 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1032 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1033 spin_lock_irqsave(hlist_lock, *flags); 1033 raw_spin_lock_irqsave(hlist_lock, *flags);
1034} 1034}
1035 1035
1036static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1037 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock) 1038__acquires(hlist_lock)
1039{ 1039{
1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1041 spin_lock_irqsave(hlist_lock, *flags); 1041 raw_spin_lock_irqsave(hlist_lock, *flags);
1042} 1042}
1043 1043
1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1046__releases(hlist_lock) 1046__releases(hlist_lock)
1047{ 1047{
1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1049 spinlock_t *hlist_lock; 1049 raw_spinlock_t *hlist_lock;
1050 1050
1051 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1052 spin_unlock_irqrestore(hlist_lock, *flags); 1052 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1053} 1053}
1054 1054
1055static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags) 1056 unsigned long *flags)
1057__releases(hlist_lock) 1057__releases(hlist_lock)
1058{ 1058{
1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1060 spin_unlock_irqrestore(hlist_lock, *flags); 1060 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1061} 1061}
1062 1062
1063/* 1063/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1663 1663
1664 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1665 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1666 spin_lock_irqsave(&rp->lock, flags); 1666 raw_spin_lock_irqsave(&rp->lock, flags);
1667 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1668 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1669 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1670 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1671 spin_unlock_irqrestore(&rp->lock, flags); 1671 raw_spin_unlock_irqrestore(&rp->lock, flags);
1672 1672
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1685 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1686 } else { 1686 } else {
1687 rp->nmissed++; 1687 rp->nmissed++;
1688 spin_unlock_irqrestore(&rp->lock, flags); 1688 raw_spin_unlock_irqrestore(&rp->lock, flags);
1689 } 1689 }
1690 return 0; 1690 return 0;
1691} 1691}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1721 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1722#endif 1722#endif
1723 } 1723 }
1724 spin_lock_init(&rp->lock); 1724 raw_spin_lock_init(&rp->lock);
1725 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1726 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1727 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1960 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1962 spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
1963 } 1963 }
1964 1964
1965 /* 1965 /*
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
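
The kprobes conversion above turns the per-bucket kretprobe table locks (and rp->lock) into raw_spinlock_t, which remains a spinning lock even on preempt-rt configurations where ordinary spinlock_t can become a sleeping lock. The per-bucket hashing itself is unchanged; a stand-alone sketch of that idea using POSIX spinlocks and an invented toy hash (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

#define TBL_BITS 6
#define TBL_SIZE (1 << TBL_BITS)

static pthread_spinlock_t tbl_locks[TBL_SIZE];

/* Toy hash; the kernel uses hash_ptr(tsk, KPROBE_HASH_BITS) instead. */
static unsigned long bucket_of(const void *p)
{
        return ((unsigned long)p >> 4) & (TBL_SIZE - 1);
}

int main(void)
{
        int obj;
        unsigned long h;

        for (int i = 0; i < TBL_SIZE; i++)
                pthread_spin_init(&tbl_locks[i], PTHREAD_PROCESS_PRIVATE);

        h = bucket_of(&obj);
        pthread_spin_lock(&tbl_locks[h]);       /* one lock per bucket, not one big lock */
        printf("bucket %lu locked\n", h);
        pthread_spin_unlock(&tbl_locks[h]);
        return 0;
}
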
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3b053c04dd86..4e316e1acf58 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,10 +11,11 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20 21
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ba7cccb4994..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,7 +12,7 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
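
kthread_freezable_should_stop(), added above, gives freezable kthreads a single call that both handles the refrigerator and checks for kthread_stop(). A kernel-style usage sketch, not a buildable module; the thread function, its work and the one-second sleep are invented:

static int my_worker_thread(void *data)
{
        bool was_frozen;

        set_freezable();        /* opt this kthread into the freezer */

        while (!kthread_freezable_should_stop(&was_frozen)) {
                if (was_frozen)
                        pr_info("my_worker: thawed, revalidating state\n");

                /* ... do one unit of work ... */

                schedule_timeout_interruptible(HZ);
        }
        return 0;
}
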
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e10413..a462b317f9a0 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/module.h> 56#include <linux/export.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_RAW_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 spin_lock_irqsave(&latency_lock, flags); 75 raw_spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 spin_unlock_irqrestore(&latency_lock, flags); 78 raw_spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 spin_lock_irqsave(&latency_lock, flags); 85 raw_spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 spin_unlock_irqrestore(&latency_lock, flags); 87 raw_spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 spin_lock_irqsave(&latency_lock, flags); 193 raw_spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 spin_unlock_irqrestore(&latency_lock, flags); 234 raw_spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h>
47 48
48#include <asm/sections.h> 49#include <asm/sections.h>
49 50
@@ -96,8 +97,13 @@ static int graph_lock(void)
96 97
97static inline int graph_unlock(void) 98static inline int graph_unlock(void)
98{ 99{
99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) 100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
101 /*
102 * The lockdep graph lock isn't locked while we expect it to
103 * be, we're confused now, bye!
104 */
100 return DEBUG_LOCKS_WARN_ON(1); 105 return DEBUG_LOCKS_WARN_ON(1);
106 }
101 107
102 current->lockdep_recursion--; 108 current->lockdep_recursion--;
103 arch_spin_unlock(&lockdep_lock); 109 arch_spin_unlock(&lockdep_lock);
@@ -134,6 +140,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
134static inline struct lock_class *hlock_class(struct held_lock *hlock) 140static inline struct lock_class *hlock_class(struct held_lock *hlock)
135{ 141{
136 if (!hlock->class_idx) { 142 if (!hlock->class_idx) {
143 /*
144 * Someone passed in garbage, we give up.
145 */
137 DEBUG_LOCKS_WARN_ON(1); 146 DEBUG_LOCKS_WARN_ON(1);
138 return NULL; 147 return NULL;
139 } 148 }
@@ -422,6 +431,7 @@ unsigned int max_lockdep_depth;
422 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
423 */ 432 */
424static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
425static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
426static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
427 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -490,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 500 usage[i] = '\0';
491} 501}
492 502
493static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
494{ 504{
495 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
496 const char *name; 506 const char *name;
497 507
498 name = class->name; 508 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
505static void print_lock_name(struct lock_class *class)
506{
507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
508 const char *name;
509
510 get_usage_chars(class, usage);
511
512 name = class->name;
513 if (!name) { 509 if (!name) {
514 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
515 printk(" (%s", name); 511 printk("%s", name);
516 } else { 512 } else {
517 printk(" (%s", name); 513 printk("%s", name);
518 if (class->name_version > 1) 514 if (class->name_version > 1)
519 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
520 if (class->subclass) 516 if (class->subclass)
521 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
522 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
523 printk("){%s}", usage); 529 printk("){%s}", usage);
524} 530}
525 531
@@ -559,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
559 } 565 }
560} 566}
561 567
562static void print_kernel_version(void) 568static void print_kernel_ident(void)
563{ 569{
564 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
565 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
566 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
567} 574}
568 575
569static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
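
print_kernel_ident() above now appends print_tainted() to the banner, so every lockdep splat records whether the kernel was tainted. The "%.*s" plus strcspn() idiom it keeps using prints only the first word of the version string; a stand-alone illustration with invented values:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *release = "3.2.0-rc1";
        const char *version = "#1 SMP Tue Nov 8 12:00:00 UTC 2011";
        const char *tainted = "Not tainted";    /* what print_tainted() might return */

        /* %.*s limits the output to the characters before the first space */
        printf("%s %.*s %s\n", release,
               (int)strcspn(version, " "), version, tainted);
        return 0;
}
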
@@ -647,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
647 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
648 lockdep_init(); 655 lockdep_init();
649 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
650 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
651 } 659 }
652#endif 660#endif
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
687 */ 695 */
688 list_for_each_entry(class, hash_head, hash_entry) { 696 list_for_each_entry(class, hash_head, hash_entry) {
689 if (class->key == key) { 697 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
690 WARN_ON_ONCE(class->name != lock->name); 702 WARN_ON_ONCE(class->name != lock->name);
691 return class; 703 return class;
692 } 704 }
@@ -710,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
710 722
711 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
712 if (likely(class)) 724 if (likely(class))
713 return class; 725 goto out_set_class_cache;
714 726
715 /* 727 /*
716 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -795,11 +807,16 @@ out_unlock_set:
795 graph_unlock(); 807 graph_unlock();
796 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
797 809
810out_set_class_cache:
798 if (!subclass || force) 811 if (!subclass || force)
799 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class; 814 lock->class_cache[subclass] = class;
802 815
816 /*
817 * Hash collision, did we smoke some? We found a class with a matching
818 * hash but the subclass -- which is hashed in -- didn't match.
819 */
803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 820 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
804 return NULL; 821 return NULL;
805 822
@@ -926,7 +943,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
926 unsigned long nr; 943 unsigned long nr;
927 944
928 nr = lock - list_entries; 945 nr = lock - list_entries;
929 WARN_ON(nr >= nr_list_entries); 946 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
930 lock->parent = parent; 947 lock->parent = parent;
931 lock->class->dep_gen_id = lockdep_dependency_gen_id; 948 lock->class->dep_gen_id = lockdep_dependency_gen_id;
932} 949}
@@ -936,7 +953,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
936 unsigned long nr; 953 unsigned long nr;
937 954
938 nr = lock - list_entries; 955 nr = lock - list_entries;
939 WARN_ON(nr >= nr_list_entries); 956 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
940 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 957 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
941} 958}
942 959
@@ -1129,10 +1146,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1129 if (debug_locks_silent) 1146 if (debug_locks_silent)
1130 return 0; 1147 return 0;
1131 1148
1132 printk("\n=======================================================\n"); 1149 printk("\n");
1133 printk( "[ INFO: possible circular locking dependency detected ]\n"); 1150 printk("======================================================\n");
1134 print_kernel_version(); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1135 printk( "-------------------------------------------------------\n"); 1152 print_kernel_ident();
1153 printk("-------------------------------------------------------\n");
1136 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1137 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
1138 print_lock(check_src); 1156 print_lock(check_src);
@@ -1196,6 +1214,9 @@ static noinline int print_bfs_bug(int ret)
1196 if (!debug_locks_off_graph_unlock()) 1214 if (!debug_locks_off_graph_unlock())
1197 return 0; 1215 return 0;
1198 1216
1217 /*
1218 * Breadth-first-search failed, graph got corrupted?
1219 */
1199 WARN(1, "lockdep bfs error:%d\n", ret); 1220 WARN(1, "lockdep bfs error:%d\n", ret);
1200 1221
1201 return 0; 1222 return 0;
@@ -1463,11 +1484,12 @@ print_bad_irq_dependency(struct task_struct *curr,
1463 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1484 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1464 return 0; 1485 return 0;
1465 1486
1466 printk("\n======================================================\n"); 1487 printk("\n");
1467 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1488 printk("======================================================\n");
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1468 irqclass, irqclass); 1490 irqclass, irqclass);
1469 print_kernel_version(); 1491 print_kernel_ident();
1470 printk( "------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1471 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1472 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
1473 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1495 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1714,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1692 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1714 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1693 return 0; 1715 return 0;
1694 1716
1695 printk("\n=============================================\n"); 1717 printk("\n");
1696 printk( "[ INFO: possible recursive locking detected ]\n"); 1718 printk("=============================================\n");
1697 print_kernel_version(); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1698 printk( "---------------------------------------------\n"); 1720 print_kernel_ident();
1721 printk("---------------------------------------------\n");
1699 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1700 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
1701 print_lock(next); 1724 print_lock(next);
@@ -1944,6 +1967,11 @@ out_bug:
1944 if (!debug_locks_off_graph_unlock()) 1967 if (!debug_locks_off_graph_unlock())
1945 return 0; 1968 return 0;
1946 1969
1970 /*
1971 * Clearly we all shouldn't be here, but since we made it we
1972 * can reliable say we messed up our state. See the above two
1973 * gotos for reasons why we could possibly end up here.
1974 */
1947 WARN_ON(1); 1975 WARN_ON(1);
1948 1976
1949 return 0; 1977 return 0;
@@ -1975,6 +2003,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1975 struct held_lock *hlock_curr, *hlock_next; 2003 struct held_lock *hlock_curr, *hlock_next;
1976 int i, j; 2004 int i, j;
1977 2005
2006 /*
2007 * We might need to take the graph lock, ensure we've got IRQs
2008 * disabled to make this an IRQ-safe lock.. for recursion reasons
2009 * lockdep won't complain about its own locking errors.
2010 */
1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2011 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1979 return 0; 2012 return 0;
1980 /* 2013 /*
@@ -2126,6 +2159,10 @@ static void check_chain_key(struct task_struct *curr)
2126 hlock = curr->held_locks + i; 2159 hlock = curr->held_locks + i;
2127 if (chain_key != hlock->prev_chain_key) { 2160 if (chain_key != hlock->prev_chain_key) {
2128 debug_locks_off(); 2161 debug_locks_off();
2162 /*
2163 * We got mighty confused, our chain keys don't match
 2164 * with what we expect, someone trampled on our task state?
2165 */
2129 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2166 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2130 curr->lockdep_depth, i, 2167 curr->lockdep_depth, i,
2131 (unsigned long long)chain_key, 2168 (unsigned long long)chain_key,
@@ -2133,6 +2170,9 @@ static void check_chain_key(struct task_struct *curr)
2133 return; 2170 return;
2134 } 2171 }
2135 id = hlock->class_idx - 1; 2172 id = hlock->class_idx - 1;
2173 /*
2174 * Whoops ran out of static storage again?
2175 */
2136 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2176 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2137 return; 2177 return;
2138 2178
@@ -2144,6 +2184,10 @@ static void check_chain_key(struct task_struct *curr)
2144 } 2184 }
2145 if (chain_key != curr->curr_chain_key) { 2185 if (chain_key != curr->curr_chain_key) {
2146 debug_locks_off(); 2186 debug_locks_off();
2187 /*
2188 * More smoking hash instead of calculating it, damn see these
2189 * numbers float.. I bet that a pink elephant stepped on my memory.
2190 */
2147 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2191 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2148 curr->lockdep_depth, i, 2192 curr->lockdep_depth, i,
2149 (unsigned long long)chain_key, 2193 (unsigned long long)chain_key,
@@ -2177,10 +2221,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2177 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2221 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2178 return 0; 2222 return 0;
2179 2223
2180 printk("\n=================================\n"); 2224 printk("\n");
2181 printk( "[ INFO: inconsistent lock state ]\n"); 2225 printk("=================================\n");
2182 print_kernel_version(); 2226 printk("[ INFO: inconsistent lock state ]\n");
2183 printk( "---------------------------------\n"); 2227 print_kernel_ident();
2228 printk("---------------------------------\n");
2184 2229
2185 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
2186 usage_str[prev_bit], usage_str[new_bit]); 2231 usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2286,11 @@ print_irq_inversion_bug(struct task_struct *curr,
2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2286 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2242 return 0; 2287 return 0;
2243 2288
2244 printk("\n=========================================================\n"); 2289 printk("\n");
2245 printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); 2290 printk("=========================================================\n");
2246 print_kernel_version(); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2247 printk( "---------------------------------------------------------\n"); 2292 print_kernel_ident();
2293 printk("---------------------------------------------------------\n");
2248 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2249 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
2250 print_lock(this); 2296 print_lock(this);
@@ -2525,12 +2571,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 return; 2571 return;
2526 } 2572 }
2527 2573
2574 /*
2575 * We're enabling irqs and according to our state above irqs weren't
2576 * already enabled, yet we find the hardware thinks they are in fact
2577 * enabled.. someone messed up their IRQ state tracing.
2578 */
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2579 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return; 2580 return;
2530 2581
2582 /*
2583 * See the fine text that goes along with this variable definition.
2584 */
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2585 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return; 2586 return;
2533 2587
2588 /*
2589 * Can't allow enabling interrupts while in an interrupt handler,
2590 * that's general bad form and such. Recursion, limited stack etc..
2591 */
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2592 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return; 2593 return;
2536 2594
@@ -2558,6 +2616,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
2558 if (unlikely(!debug_locks || current->lockdep_recursion)) 2616 if (unlikely(!debug_locks || current->lockdep_recursion))
2559 return; 2617 return;
2560 2618
2619 /*
2620 * So we're supposed to get called after you mask local IRQs, but for
2621 * some reason the hardware doesn't quite think you did a proper job.
2622 */
2561 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2623 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2562 return; 2624 return;
2563 2625
@@ -2590,6 +2652,10 @@ void trace_softirqs_on(unsigned long ip)
2590 if (unlikely(!debug_locks || current->lockdep_recursion)) 2652 if (unlikely(!debug_locks || current->lockdep_recursion))
2591 return; 2653 return;
2592 2654
2655 /*
2656 * We fancy IRQs being disabled here, see softirq.c, avoids
2657 * funny state and nesting things.
2658 */
2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2659 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2594 return; 2660 return;
2595 2661
@@ -2626,6 +2692,9 @@ void trace_softirqs_off(unsigned long ip)
2626 if (unlikely(!debug_locks || current->lockdep_recursion)) 2692 if (unlikely(!debug_locks || current->lockdep_recursion))
2627 return; 2693 return;
2628 2694
2695 /*
2696 * We fancy IRQs being disabled here, see softirq.c
2697 */
2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2698 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2630 return; 2699 return;
2631 2700
@@ -2637,6 +2706,9 @@ void trace_softirqs_off(unsigned long ip)
2637 curr->softirq_disable_ip = ip; 2706 curr->softirq_disable_ip = ip;
2638 curr->softirq_disable_event = ++curr->irq_events; 2707 curr->softirq_disable_event = ++curr->irq_events;
2639 debug_atomic_inc(softirqs_off_events); 2708 debug_atomic_inc(softirqs_off_events);
2709 /*
2710 * Whoops, we wanted softirqs off, so why aren't they?
2711 */
2640 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2712 DEBUG_LOCKS_WARN_ON(!softirq_count());
2641 } else 2713 } else
2642 debug_atomic_inc(redundant_softirqs_off); 2714 debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2733,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2661 if (!(gfp_mask & __GFP_FS)) 2733 if (!(gfp_mask & __GFP_FS))
2662 return; 2734 return;
2663 2735
2736 /*
2737 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2738 */
2664 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2739 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2665 return; 2740 return;
2666 2741
@@ -2773,13 +2848,13 @@ static int separate_irq_context(struct task_struct *curr,
2773 return 0; 2848 return 0;
2774} 2849}
2775 2850
2776#else 2851#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2777 2852
2778static inline 2853static inline
2779int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2854int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2780 enum lock_usage_bit new_bit) 2855 enum lock_usage_bit new_bit)
2781{ 2856{
2782 WARN_ON(1); 2857 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAGS */
2783 return 1; 2858 return 1;
2784} 2859}
2785 2860
@@ -2799,7 +2874,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2799{ 2874{
2800} 2875}
2801 2876
2802#endif 2877#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2803 2878
2804/* 2879/*
2805 * Mark a lock with a usage bit, and validate the state transition: 2880 * Mark a lock with a usage bit, and validate the state transition:
@@ -2874,12 +2949,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2874void lockdep_init_map(struct lockdep_map *lock, const char *name, 2949void lockdep_init_map(struct lockdep_map *lock, const char *name,
2875 struct lock_class_key *key, int subclass) 2950 struct lock_class_key *key, int subclass)
2876{ 2951{
2877 memset(lock, 0, sizeof(*lock)); 2952 int i;
2953
2954 kmemcheck_mark_initialized(lock, sizeof(*lock));
2955
2956 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2957 lock->class_cache[i] = NULL;
2878 2958
2879#ifdef CONFIG_LOCK_STAT 2959#ifdef CONFIG_LOCK_STAT
2880 lock->cpu = raw_smp_processor_id(); 2960 lock->cpu = raw_smp_processor_id();
2881#endif 2961#endif
2882 2962
2963 /*
2964 * Can't be having no nameless bastards around this place!
2965 */
2883 if (DEBUG_LOCKS_WARN_ON(!name)) { 2966 if (DEBUG_LOCKS_WARN_ON(!name)) {
2884 lock->name = "NULL"; 2967 lock->name = "NULL";
2885 return; 2968 return;
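
lockdep_init_map() above stops memset()ing the whole lockdep_map: it marks the object initialized for kmemcheck and then explicitly clears only the class_cache[] slots it owns. A stand-alone sketch of that field-wise style with an invented struct (the kmemcheck side is not modeled here):

#include <stdio.h>

#define NR_CACHE 2

struct map {
        const char *name;               /* filled in by the caller afterwards */
        void *class_cache[NR_CACHE];
        int cpu;                        /* set separately when stats are on */
};

static void init_map(struct map *m)
{
        /* touch only the fields this init path is responsible for */
        for (int i = 0; i < NR_CACHE; i++)
                m->class_cache[i] = NULL;
}

int main(void)
{
        struct map m = { .cpu = 3 };

        init_map(&m);
        m.name = "demo";
        printf("%s cpu=%d cache0=%p\n", m.name, m.cpu, m.class_cache[0]);
        return 0;
}
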
@@ -2887,6 +2970,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2887 2970
2888 lock->name = name; 2971 lock->name = name;
2889 2972
2973 /*
2974 * No key, no joy, we need to hash something.
2975 */
2890 if (DEBUG_LOCKS_WARN_ON(!key)) 2976 if (DEBUG_LOCKS_WARN_ON(!key))
2891 return; 2977 return;
2892 /* 2978 /*
@@ -2894,6 +2980,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2894 */ 2980 */
2895 if (!static_obj(key)) { 2981 if (!static_obj(key)) {
2896 printk("BUG: key %p not in .data!\n", key); 2982 printk("BUG: key %p not in .data!\n", key);
2983 /*
2984 * What it says above ^^^^^, I suggest you read it.
2985 */
2897 DEBUG_LOCKS_WARN_ON(1); 2986 DEBUG_LOCKS_WARN_ON(1);
2898 return; 2987 return;
2899 } 2988 }
@@ -2932,6 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2932 if (unlikely(!debug_locks)) 3021 if (unlikely(!debug_locks))
2933 return 0; 3022 return 0;
2934 3023
3024 /*
3025 * Lockdep should run with IRQs disabled, otherwise we could
3026 * get an interrupt which would want to take locks, which would
3027 * end up in lockdep and have you got a head-ache already?
3028 */
2935 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3029 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2936 return 0; 3030 return 0;
2937 3031
@@ -2963,6 +3057,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963 * dependency checks are done) 3057 * dependency checks are done)
2964 */ 3058 */
2965 depth = curr->lockdep_depth; 3059 depth = curr->lockdep_depth;
3060 /*
3061 * Ran out of static storage for our per-task lock stack again have we?
3062 */
2966 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3063 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2967 return 0; 3064 return 0;
2968 3065
@@ -2981,6 +3078,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2981 } 3078 }
2982 3079
2983 hlock = curr->held_locks + depth; 3080 hlock = curr->held_locks + depth;
3081 /*
3082 * Plain impossible, we just registered it and checked it weren't no
3083 * NULL like.. I bet this mushroom I ate was good!
3084 */
2984 if (DEBUG_LOCKS_WARN_ON(!class)) 3085 if (DEBUG_LOCKS_WARN_ON(!class))
2985 return 0; 3086 return 0;
2986 hlock->class_idx = class_idx; 3087 hlock->class_idx = class_idx;
@@ -3015,11 +3116,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3015 * the hash, not class->key. 3116 * the hash, not class->key.
3016 */ 3117 */
3017 id = class - lock_classes; 3118 id = class - lock_classes;
3119 /*
3120 * Whoops, we did it again.. ran straight out of our static allocation.
3121 */
3018 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3122 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3019 return 0; 3123 return 0;
3020 3124
3021 chain_key = curr->curr_chain_key; 3125 chain_key = curr->curr_chain_key;
3022 if (!depth) { 3126 if (!depth) {
3127 /*
3128 * How can we have a chain hash when we ain't got no keys?!
3129 */
3023 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3130 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3024 return 0; 3131 return 0;
3025 chain_head = 1; 3132 chain_head = 1;
@@ -3065,9 +3172,11 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3065 if (debug_locks_silent) 3172 if (debug_locks_silent)
3066 return 0; 3173 return 0;
3067 3174
3068 printk("\n=====================================\n"); 3175 printk("\n");
3069 printk( "[ BUG: bad unlock balance detected! ]\n"); 3176 printk("=====================================\n");
3070 printk( "-------------------------------------\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3179 printk("-------------------------------------\n");
3071 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3072 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
3073 print_lockdep_cache(lock); 3182 print_lockdep_cache(lock);
@@ -3091,6 +3200,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3091{ 3200{
3092 if (unlikely(!debug_locks)) 3201 if (unlikely(!debug_locks))
3093 return 0; 3202 return 0;
3203 /*
3204 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3205 */
3094 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3206 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3095 return 0; 3207 return 0;
3096 3208
@@ -3120,6 +3232,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3120 if (!class) 3232 if (!class)
3121 return 0; 3233 return 0;
3122 3234
3235 /*
3236 * References, but not a lock we're actually ref-counting?
3237 * State got messed up, follow the sites that change ->references
3238 * and try to make sense of it.
3239 */
3123 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3240 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3124 return 0; 3241 return 0;
3125 3242
@@ -3142,6 +3259,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3142 int i; 3259 int i;
3143 3260
3144 depth = curr->lockdep_depth; 3261 depth = curr->lockdep_depth;
3262 /*
3263 * This function is about (re)setting the class of a held lock,
3264 * yet we're not actually holding any locks. Naughty user!
3265 */
3145 if (DEBUG_LOCKS_WARN_ON(!depth)) 3266 if (DEBUG_LOCKS_WARN_ON(!depth))
3146 return 0; 3267 return 0;
3147 3268
@@ -3177,6 +3298,10 @@ found_it:
3177 return 0; 3298 return 0;
3178 } 3299 }
3179 3300
3301 /*
3302 * I took it apart and put it back together again, except now I have
3303 * these 'spare' parts.. where shall I put them.
3304 */
3180 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3305 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3181 return 0; 3306 return 0;
3182 return 1; 3307 return 1;
@@ -3201,6 +3326,10 @@ lock_release_non_nested(struct task_struct *curr,
3201 * of held locks: 3326 * of held locks:
3202 */ 3327 */
3203 depth = curr->lockdep_depth; 3328 depth = curr->lockdep_depth;
3329 /*
3330 * So we're all set to release this lock.. wait what lock? We don't
3331 * own any locks, you've been drinking again?
3332 */
3204 if (DEBUG_LOCKS_WARN_ON(!depth)) 3333 if (DEBUG_LOCKS_WARN_ON(!depth))
3205 return 0; 3334 return 0;
3206 3335
@@ -3253,6 +3382,10 @@ found_it:
3253 return 0; 3382 return 0;
3254 } 3383 }
3255 3384
3385 /*
3386 * We had N bottles of beer on the wall, we drank one, but now
3387 * there's not N-1 bottles of beer left on the wall...
3388 */
3256 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3389 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3257 return 0; 3390 return 0;
3258 return 1; 3391 return 1;
@@ -3283,6 +3416,9 @@ static int lock_release_nested(struct task_struct *curr,
3283 return lock_release_non_nested(curr, lock, ip); 3416 return lock_release_non_nested(curr, lock, ip);
3284 curr->lockdep_depth--; 3417 curr->lockdep_depth--;
3285 3418
3419 /*
3420 * No more locks, but somehow we've got hash left over, who left it?
3421 */
3286 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3422 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3287 return 0; 3423 return 0;
3288 3424
@@ -3365,10 +3501,13 @@ static void check_flags(unsigned long flags)
3365 * check if not in hardirq contexts: 3501 * check if not in hardirq contexts:
3366 */ 3502 */
3367 if (!hardirq_count()) { 3503 if (!hardirq_count()) {
3368 if (softirq_count()) 3504 if (softirq_count()) {
3505 /* like the above, but with softirqs */
3369 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3506 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3370 else 3507 } else {
3508 /* lick the above, does it taste good? */
3371 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3509 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3510 }
3372 } 3511 }
3373 3512
3374 if (!debug_locks) 3513 if (!debug_locks)
@@ -3478,9 +3617,11 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3478 if (debug_locks_silent) 3617 if (debug_locks_silent)
3479 return 0; 3618 return 0;
3480 3619
3481 printk("\n=================================\n"); 3620 printk("\n");
3482 printk( "[ BUG: bad contention detected! ]\n"); 3621 printk("=================================\n");
3483 printk( "---------------------------------\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3624 printk("---------------------------------\n");
3484 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3485 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
3486 print_lockdep_cache(lock); 3627 print_lockdep_cache(lock);
@@ -3506,6 +3647,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3506 int i, contention_point, contending_point; 3647 int i, contention_point, contending_point;
3507 3648
3508 depth = curr->lockdep_depth; 3649 depth = curr->lockdep_depth;
3650 /*
3651 * Whee, we contended on this lock, except it seems we're not
3652 * actually trying to acquire anything much at all..
3653 */
3509 if (DEBUG_LOCKS_WARN_ON(!depth)) 3654 if (DEBUG_LOCKS_WARN_ON(!depth))
3510 return; 3655 return;
3511 3656
@@ -3555,6 +3700,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3555 int i, cpu; 3700 int i, cpu;
3556 3701
3557 depth = curr->lockdep_depth; 3702 depth = curr->lockdep_depth;
3703 /*
3704 * Yay, we acquired ownership of this lock we didn't try to
3705 * acquire, how the heck did that happen?
3706 */
3558 if (DEBUG_LOCKS_WARN_ON(!depth)) 3707 if (DEBUG_LOCKS_WARN_ON(!depth))
3559 return; 3708 return;
3560 3709
@@ -3759,8 +3908,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3759 match |= class == lock->class_cache[j]; 3908 match |= class == lock->class_cache[j];
3760 3909
3761 if (unlikely(match)) { 3910 if (unlikely(match)) {
3762 if (debug_locks_off_graph_unlock()) 3911 if (debug_locks_off_graph_unlock()) {
3912 /*
3913 * We all just reset everything, how did it match?
3914 */
3763 WARN_ON(1); 3915 WARN_ON(1);
3916 }
3764 goto out_restore; 3917 goto out_restore;
3765 } 3918 }
3766 } 3919 }
@@ -3823,7 +3976,8 @@ void __init lockdep_info(void)
3823 3976
3824#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3825 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3826 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3827 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3828 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3829 } 3983 }
@@ -3839,9 +3993,11 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3839 if (debug_locks_silent) 3993 if (debug_locks_silent)
3840 return; 3994 return;
3841 3995
3842 printk("\n=========================\n"); 3996 printk("\n");
3843 printk( "[ BUG: held lock freed! ]\n"); 3997 printk("=========================\n");
3844 printk( "-------------------------\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
4000 printk("-------------------------\n");
3845 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3846 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3847 print_lock(hlock); 4003 print_lock(hlock);
@@ -3895,9 +4051,11 @@ static void print_held_locks_bug(struct task_struct *curr)
3895 if (debug_locks_silent) 4051 if (debug_locks_silent)
3896 return; 4052 return;
3897 4053
3898 printk("\n=====================================\n"); 4054 printk("\n");
3899 printk( "[ BUG: lock held at task exit time! ]\n"); 4055 printk("=====================================\n");
3900 printk( "-------------------------------------\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4058 printk("-------------------------------------\n");
3901 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
3902 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
3903 lockdep_print_held_locks(curr); 4061 lockdep_print_held_locks(curr);
@@ -3991,16 +4149,18 @@ void lockdep_sys_exit(void)
3991 if (unlikely(curr->lockdep_depth)) { 4149 if (unlikely(curr->lockdep_depth)) {
3992 if (!debug_locks_off()) 4150 if (!debug_locks_off())
3993 return; 4151 return;
3994 printk("\n================================================\n"); 4152 printk("\n");
3995 printk( "[ BUG: lock held when returning to user space! ]\n"); 4153 printk("================================================\n");
3996 printk( "------------------------------------------------\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4156 printk("------------------------------------------------\n");
3997 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
3998 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
3999 lockdep_print_held_locks(curr); 4159 lockdep_print_held_locks(curr);
4000 } 4160 }
4001} 4161}
4002 4162
4003void lockdep_rcu_dereference(const char *file, const int line) 4163void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4004{ 4164{
4005 struct task_struct *curr = current; 4165 struct task_struct *curr = current;
4006 4166
@@ -4009,15 +4169,38 @@ void lockdep_rcu_dereference(const char *file, const int line)
4009 return; 4169 return;
4010#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4170#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4011 /* Note: the following can be executed concurrently, so be careful. */ 4171 /* Note: the following can be executed concurrently, so be careful. */
4012 printk("\n===================================================\n"); 4172 printk("\n");
4013 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 4173 printk("===============================\n");
4014 printk( "---------------------------------------------------\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4015 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 4175 print_kernel_ident();
4016 file, line); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s);
4017 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4018 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
 4182	 * If a CPU is in the RCU-free window in idle (i.e. in the section
 4183	 * between rcu_idle_enter() and rcu_idle_exit()), then RCU
 4184	 * considers that CPU to be in an "extended quiescent state",
 4185	 * which means that RCU will be completely ignoring that CPU.
 4186	 * Therefore, rcu_read_lock() and friends have absolutely no
 4187	 * effect on a CPU running in that state. In other words, even if
 4188	 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
 4189	 * delete data structures out from under it. RCU really has no
 4190	 * choice here: we need to keep an RCU-free window in idle where
 4191	 * the CPU may possibly enter into low power mode. This way, CPUs
 4192	 * that have started a grace period can notice the extended
 4193	 * quiescent state; otherwise we would delay every grace period
 4194	 * for as long as we run in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4019 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4020 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4021 dump_stack(); 4204 dump_stack();
4022} 4205}
4023EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); 4206EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
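The comment added to lockdep_rcu_suspicious() above explains that rcu_read_lock() and friends have no effect once a CPU has entered RCU's extended quiescent state. A hypothetical sketch of the misuse that the new rcu_is_cpu_idle() line reports; struct my_conf, global_conf and buggy_idle_path() are invented for the illustration and appear nowhere in this patch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct my_conf {
	int value;
};

static struct my_conf __rcu *global_conf;	/* invented example pointer */

static void buggy_idle_path(void)
{
	struct my_conf *conf;

	rcu_idle_enter();	/* enter the extended quiescent state
				 * (normally only the idle loop does this) */

	rcu_read_lock();	/* has no effect while RCU-idle ... */
	conf = rcu_dereference(global_conf);	/* ... so this is reported as suspicious */
	if (conf)
		pr_info("conf value %d\n", conf->value);	/* may race with a concurrent kfree */
	rcu_read_unlock();

	rcu_idle_exit();	/* leave the extended quiescent state */
}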
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 71edd2f60c02..91c32a0b612c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f843..acf6ed3ebe81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,7 +16,7 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/module.h> 19#include <linux/export.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -849,12 +842,32 @@ out:
849 return ret; 842 return ret;
850} 843}
851 844
845static size_t module_flags_taint(struct module *mod, char *buf)
846{
847 size_t l = 0;
848
849 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
850 buf[l++] = 'P';
851 if (mod->taints & (1 << TAINT_OOT_MODULE))
852 buf[l++] = 'O';
853 if (mod->taints & (1 << TAINT_FORCED_MODULE))
854 buf[l++] = 'F';
855 if (mod->taints & (1 << TAINT_CRAP))
856 buf[l++] = 'C';
857 /*
858 * TAINT_FORCED_RMMOD: could be added.
859 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
860 * apply to modules.
861 */
862 return l;
863}
864
852static inline void print_unload_info(struct seq_file *m, struct module *mod) 865static inline void print_unload_info(struct seq_file *m, struct module *mod)
853{ 866{
854 struct module_use *use; 867 struct module_use *use;
855 int printed_something = 0; 868 int printed_something = 0;
856 869
857 seq_printf(m, " %u ", module_refcount(mod)); 870 seq_printf(m, " %lu ", module_refcount(mod));
858 871
859 /* Always include a trailing , so userspace can differentiate 872 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 873 between this and the old multi-field proc format. */
@@ -904,13 +917,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 917static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 918 struct module_kobject *mk, char *buffer)
906{ 919{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 920 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 921}
909 922
910static struct module_attribute refcnt = { 923static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 924 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 925
915void module_put(struct module *module) 926void module_put(struct module *module)
916{ 927{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
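The modinfo attributes added above expose each module's core size, init size and taint letters as files under /sys/module/<name>/. A small user-space sketch that reads them, assuming some module such as ext4 is loaded; show_attr() is a made-up helper, not part of the patch:

#include <stdio.h>

static void show_attr(const char *mod, const char *attr)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/module/%s/%s", mod, attr);
	f = fopen(path, "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("%s/%s: %s", mod, attr, buf);
		fclose(f);
	}
}

int main(void)
{
	show_attr("ext4", "coresize");	/* bytes in the core section */
	show_attr("ext4", "initsize");	/* bytes in the (freed) init section */
	show_attr("ext4", "taint");	/* per-module taint letters, e.g. "O" */
	return 0;
}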
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
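The new comment on layout_symtab() (lines 2222-2228 above) describes the scheme that replaces the strmap bitmap: one pass sums the lengths of the strings belonging to the core symbols, and add_kallsyms() later copies them back to back, recording each string's offset and accepting duplicate copies. A standalone user-space sketch of that measure-then-pack pattern; pack_strings() is an invented name and not kernel code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pass 1 measures, pass 2 copies; offs[i] records where keep[i] landed. */
static char *pack_strings(const char *const *keep, size_t n, size_t *offs)
{
	size_t i, total = 1;	/* offset 0 stays "", as in an ELF strtab */
	char *tab, *s;

	for (i = 0; i < n; i++)
		total += strlen(keep[i]) + 1;

	tab = malloc(total);
	if (!tab)
		return NULL;

	s = tab;
	*s++ = '\0';
	for (i = 0; i < n; i++) {
		offs[i] = s - tab;
		s += strlen(strcpy(s, keep[i])) + 1;
	}
	return tab;
}

int main(void)
{
	const char *keep[] = { "init_module", "cleanup_module", "init_module" };
	size_t offs[3];
	char *tab = pack_strings(keep, 3, offs);

	if (tab) {
		printf("offsets: %zu %zu %zu\n", offs[0], offs[1], offs[2]);
		printf("second string: %s\n", tab + offs[1]);
		free(tab);
	}
	return 0;
}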
@@ -2487,6 +2526,9 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2487 return -ENOEXEC; 2526 return -ENOEXEC;
2488 } 2527 }
2489 2528
2529 if (!get_modinfo(info, "intree"))
2530 add_taint_module(mod, TAINT_OOT_MODULE);
2531
2490 if (get_modinfo(info, "staging")) { 2532 if (get_modinfo(info, "staging")) {
2491 add_taint_module(mod, TAINT_CRAP); 2533 add_taint_module(mod, TAINT_CRAP);
2492 printk(KERN_WARNING "%s: module is from the staging directory," 2534 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2618,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2618 mod->module_init = ptr; 2660 mod->module_init = ptr;
2619 2661
2620 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2621 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2622 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2623 void *dest; 2665 void *dest;
2624 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2636,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2636 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2637 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2638 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2639 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2640 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2641 } 2683 }
2642 2684
2643 return 0; 2685 return 0;
@@ -2739,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2739 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2740 special cases for the architectures. */ 2782 special cases for the architectures. */
2741 layout_sections(mod, info); 2783 layout_sections(mod, info);
2742
2743 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2744 * sizeof(long), GFP_KERNEL);
2745 if (!info->strmap) {
2746 err = -ENOMEM;
2747 goto free_percpu;
2748 }
2749 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2750 2785
2751 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2752 err = move_module(mod, info); 2787 err = move_module(mod, info);
2753 if (err) 2788 if (err)
2754 goto free_strmap; 2789 goto free_percpu;
2755 2790
2756 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2757 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2758 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2759 return mod; 2794 return mod;
2760 2795
2761free_strmap:
2762 kfree(info->strmap);
2763free_percpu: 2796free_percpu:
2764 percpu_modfree(mod); 2797 percpu_modfree(mod);
2765out: 2798out:
@@ -2769,7 +2802,6 @@ out:
2769/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2770static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2771{ 2804{
2772 kfree(info->strmap);
2773 percpu_modfree(mod); 2805 percpu_modfree(mod);
2774 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2775 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2808,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2808 struct module *mod; 2840 struct module *mod;
2809 long err; 2841 long err;
2810 2842
2811 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2812 umod, len, uargs); 2844 umod, len, uargs);
2813 2845
2814 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2878,8 +2910,7 @@ static struct module *load_module(void __user *umod,
2878 } 2910 }
2879 2911
2880 /* This has to be done once we're sure module name is unique. */ 2912 /* This has to be done once we're sure module name is unique. */
2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2913 dynamic_debug_setup(info.debug, info.num_debug);
2882 dynamic_debug_setup(info.debug, info.num_debug);
2883 2914
2884 /* Find duplicate symbols */ 2915 /* Find duplicate symbols */
2885 err = verify_export_symbols(mod); 2916 err = verify_export_symbols(mod);
@@ -2900,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2900 if (err < 0) 2931 if (err < 0)
2901 goto unlink; 2932 goto unlink;
2902 2933
2903 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2904 kfree(info.strmap);
2905 free_copy(&info); 2935 free_copy(&info);
2906 2936
2907 /* Done! */ 2937 /* Done! */
@@ -2915,8 +2945,7 @@ static struct module *load_module(void __user *umod,
2915 module_bug_cleanup(mod); 2945 module_bug_cleanup(mod);
2916 2946
2917 ddebug: 2947 ddebug:
2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2948 dynamic_debug_remove(info.debug);
2919 dynamic_debug_remove(info.debug);
2920 unlock: 2949 unlock:
2921 mutex_unlock(&module_mutex); 2950 mutex_unlock(&module_mutex);
2922 synchronize_sched(); 2951 synchronize_sched();
@@ -3255,18 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3255 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3256 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3257 buf[bx++] = '('; 3286 buf[bx++] = '(';
3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3259 buf[bx++] = 'P';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP))
3263 buf[bx++] = 'C';
3264 /*
3265 * TAINT_FORCED_RMMOD: could be added.
3266 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3267 * apply to modules.
3268 */
3269
3270 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3271 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3272 buf[bx++] = '-'; 3290 buf[bx++] = '-';
@@ -3487,50 +3505,3 @@ void module_layout(struct module *mod,
3487} 3505}
3488EXPORT_SYMBOL(module_layout); 3506EXPORT_SYMBOL(module_layout);
3489#endif 3507#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 73da83aff418..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5dd441..89096dd8786f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/export.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 8d7b435806c9..2d5cc4ccff7f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/module.h> 3#include <linux/export.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b98c64..b576f7f14bc6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index b91941df5e63..b45259931512 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -18,7 +18,7 @@
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb6974efb5..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
 53 * Stop ourselves in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
 84 * stop themselves or wait until they are stopped by the 1st CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
@@ -177,6 +197,7 @@ static const struct tnt tnts[] = {
177 { TAINT_WARN, 'W', ' ' }, 197 { TAINT_WARN, 'W', ' ' },
178 { TAINT_CRAP, 'C', ' ' }, 198 { TAINT_CRAP, 'C', ' ' },
179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 199 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
200 { TAINT_OOT_MODULE, 'O', ' ' },
180}; 201};
181 202
182/** 203/**
@@ -194,6 +215,7 @@ static const struct tnt tnts[] = {
194 * 'W' - Taint on warning. 215 * 'W' - Taint on warning.
195 * 'C' - modules from drivers/staging are loaded. 216 * 'C' - modules from drivers/staging are loaded.
196 * 'I' - Working around severe firmware bug. 217 * 'I' - Working around severe firmware bug.
218 * 'O' - Out-of-tree module has been loaded.
197 * 219 *
198 * The string is overwritten by the next call to print_tainted(). 220 * The string is overwritten by the next call to print_tainted().
199 */ 221 */
@@ -235,11 +257,20 @@ void add_taint(unsigned flag)
235 * Can't trust the integrity of the kernel anymore. 257 * Can't trust the integrity of the kernel anymore.
236 * We don't call directly debug_locks_off() because the issue 258 * We don't call directly debug_locks_off() because the issue
237 * is not necessarily serious enough to set oops_in_progress to 1 259 * is not necessarily serious enough to set oops_in_progress to 1
238 * Also we want to keep up lockdep for staging development and 260 * Also we want to keep up lockdep for staging/out-of-tree
239 * post-warning case. 261 * development and post-warning case.
240 */ 262 */
241 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 263 switch (flag) {
242 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 264 case TAINT_CRAP:
265 case TAINT_OOT_MODULE:
266 case TAINT_WARN:
267 case TAINT_FIRMWARE_WORKAROUND:
268 break;
269
270 default:
271 if (__debug_locks_off())
272 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 }
243 274
244 set_bit(flag, &tainted_mask); 275 set_bit(flag, &tainted_mask);
245} 276}
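panic() above now serialises on a local panic_lock: the first CPU to win the trylock runs the panic path, and every other caller parks itself in panic_smp_self_stop(). A minimal sketch of that first-one-in-wins convention; once_lock, example_halt_self() and example_do_recovery() are placeholder names for the illustration:

#include <linux/spinlock.h>
#include <asm/processor.h>

static DEFINE_SPINLOCK(once_lock);

static void example_do_recovery(void)	/* placeholder body */
{
}

static void example_halt_self(void)	/* placeholder: spin forever */
{
	while (1)
		cpu_relax();
}

static void run_once_on_first_cpu(void)
{
	/*
	 * Whoever wins the trylock owns the recovery path; anyone who
	 * races in afterwards parks itself, mirroring panic_smp_self_stop().
	 * The lock is deliberately never released.
	 */
	if (!spin_trylock(&once_lock))
		example_halt_self();

	example_do_recovery();
}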
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142a..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h> 18#include <linux/module.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
@@ -67,20 +61,27 @@ static void maybe_kfree_parameter(void *param)
67 } 61 }
68} 62}
69 63
70static inline char dash2underscore(char c) 64static char dash2underscore(char c)
71{ 65{
72 if (c == '-') 66 if (c == '-')
73 return '_'; 67 return '_';
74 return c; 68 return c;
75} 69}
76 70
77static inline int parameq(const char *input, const char *paramname) 71bool parameqn(const char *a, const char *b, size_t n)
78{ 72{
79 unsigned int i; 73 size_t i;
80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) 74
81 if (input[i] == '\0') 75 for (i = 0; i < n; i++) {
82 return 1; 76 if (dash2underscore(a[i]) != dash2underscore(b[i]))
83 return 0; 77 return false;
78 }
79 return true;
80}
81
82bool parameq(const char *a, const char *b)
83{
84 return parameqn(a, b, strlen(a)+1);
84} 85}
85 86
86static int parse_one(char *param, 87static int parse_one(char *param,
@@ -98,7 +99,7 @@ static int parse_one(char *param,
98 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 101 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 102 pr_debug("They are equal! Calling %p\n",
102 params[i].ops->set); 103 params[i].ops->set);
103 mutex_lock(&param_lock); 104 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]); 105 err = params[i].ops->set(val, &params[i]);
@@ -108,11 +109,11 @@ static int parse_one(char *param,
108 } 109 }
109 110
110 if (handle_unknown) { 111 if (handle_unknown) {
111 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 112 pr_debug("Unknown argument: calling %p\n", handle_unknown);
112 return handle_unknown(param, val); 113 return handle_unknown(param, val);
113 } 114 }
114 115
115 DEBUGP("Unknown argument `%s'\n", param); 116 pr_debug("Unknown argument `%s'\n", param);
116 return -ENOENT; 117 return -ENOENT;
117} 118}
118 119
@@ -177,7 +178,7 @@ int parse_args(const char *name,
177{ 178{
178 char *param, *val; 179 char *param, *val;
179 180
180 DEBUGP("Parsing ARGS: %s\n", args); 181 pr_debug("Parsing ARGS: %s\n", args);
181 182
182 /* Chew leading spaces */ 183 /* Chew leading spaces */
183 args = skip_spaces(args); 184 args = skip_spaces(args);
@@ -362,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
362}; 363};
363EXPORT_SYMBOL(param_ops_invbool); 364EXPORT_SYMBOL(param_ops_invbool);
364 365
366int param_set_bint(const char *val, const struct kernel_param *kp)
367{
368 struct kernel_param boolkp;
369 bool v;
370 int ret;
371
372 /* Match bool exactly, by re-using it. */
373 boolkp = *kp;
374 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376
377 ret = param_set_bool(val, &boolkp);
378 if (ret == 0)
379 *(int *)kp->arg = v;
380 return ret;
381}
382EXPORT_SYMBOL(param_set_bint);
383
384struct kernel_param_ops param_ops_bint = {
385 .set = param_set_bint,
386 .get = param_get_int,
387};
388EXPORT_SYMBOL(param_ops_bint);
389
365/* We break the rule and mangle the string. */ 390/* We break the rule and mangle the string. */
366static int param_array(const char *name, 391static int param_array(const char *name,
367 const char *val, 392 const char *val,
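parameq() and parameqn() above treat '-' and '_' as interchangeable when matching parameter names, so "foo-bar" and "foo_bar" select the same parameter. A standalone user-space demo of that matching rule; parameq_demo() is a reimplementation written only for this illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static char dash2underscore(char c)
{
	return c == '-' ? '_' : c;
}

static bool parameq_demo(const char *a, const char *b)
{
	size_t i, n = strlen(a) + 1;	/* compare the terminating NUL too */

	for (i = 0; i < n; i++)
		if (dash2underscore(a[i]) != dash2underscore(b[i]))
			return false;
	return true;
}

int main(void)
{
	printf("%d\n", parameq_demo("foo-bar", "foo_bar"));	/* prints 1 */
	printf("%d\n", parameq_demo("foo-bar", "foo_baz"));	/* prints 0 */
	return 0;
}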
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b21..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
@@ -418,7 +420,9 @@ EXPORT_SYMBOL(pid_task);
418 */ 420 */
419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 421struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
420{ 422{
421 rcu_lockdep_assert(rcu_read_lock_held()); 423 rcu_lockdep_assert(rcu_read_lock_held(),
424 "find_task_by_pid_ns() needs rcu_read_lock()"
425 " protection");
422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 426 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
423} 427}
424 428
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
 204 * is volatile in a living namespace anyway, and code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
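The ns_last_pid sysctl registered above lets a privileged process seed PID allocation in its namespace (the checkpoint/restore use case). A user-space usage sketch, assuming CAP_SYS_ADMIN: after writing N to /proc/sys/kernel/ns_last_pid, the next task forked in that namespace should normally receive PID N+1, barring races with concurrent forks:

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY);

	if (fd < 0) {
		perror("open ns_last_pid");
		return 1;
	}
	if (write(fd, "9999", 4) != 4)	/* next child should get pid 10000 */
		perror("write ns_last_pid");
	close(fd);

	if (fork() == 0) {
		printf("child pid: %d\n", getpid());
		_exit(0);
	}
	wait(NULL);
	return 0;
}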
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 640ded8f5c48..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
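With cputime_t handled through plain arithmetic, the loop in bump_cpu_timer() above reads more directly: it grows the increment by repeated doubling (avoiding both a step-per-period walk and an incr*2 overflow) and then walks back down, accumulating the overrun count. A standalone sketch of the same technique; bump() and the sample numbers are illustrative only:

#include <stdio.h>

static unsigned long long bump(unsigned long long *expires,
			       unsigned long long now,
			       unsigned long long incr)
{
	unsigned long long delta, step = incr, overrun = 0;
	int i;

	if (!incr || now < *expires)
		return 0;

	delta = now + incr - *expires;
	/* Don't test (step * 2 < delta): step * 2 might overflow. */
	for (i = 0; step < delta - step; i++)
		step += step;
	for (; i >= 0; step >>= 1, i--) {
		if (delta < step)
			continue;
		*expires += step;
		overrun += 1ULL << i;
		delta -= step;
	}
	return overrun;
}

int main(void)
{
	unsigned long long expires = 100;
	unsigned long long overruns = bump(&expires, 1000, 30);

	printf("%llu overruns, new expiry %llu\n", overruns, expires);
	return 0;
}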
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -282,13 +267,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
282 * it. 267 * it.
283 */ 268 */
284 thread_group_cputime(tsk, &sum); 269 thread_group_cputime(tsk, &sum);
285 spin_lock_irqsave(&cputimer->lock, flags); 270 raw_spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1; 271 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 272 update_gt_cputime(&cputimer->cputime, &sum);
288 } else 273 } else
289 spin_lock_irqsave(&cputimer->lock, flags); 274 raw_spin_lock_irqsave(&cputimer->lock, flags);
290 *times = cputimer->cputime; 275 *times = cputimer->cputime;
291 spin_unlock_irqrestore(&cputimer->lock, flags); 276 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
292} 277}
293 278
294/* 279/*
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -999,9 +980,9 @@ static void stop_process_timers(struct signal_struct *sig)
999 struct thread_group_cputimer *cputimer = &sig->cputimer; 980 struct thread_group_cputimer *cputimer = &sig->cputimer;
1000 unsigned long flags; 981 unsigned long flags;
1001 982
1002 spin_lock_irqsave(&cputimer->lock, flags); 983 raw_spin_lock_irqsave(&cputimer->lock, flags);
1003 cputimer->running = 0; 984 cputimer->running = 0;
1004 spin_unlock_irqrestore(&cputimer->lock, flags); 985 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1005} 986}
1006 987
1007static u32 onecputick; 988static u32 onecputick;
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
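
The check_cpu_itimer() rework above keeps the same error-accumulation scheme it had before, just spelled with plain arithmetic: each re-arm advances expires by incr and adds the per-interval rounding error, and once the accumulated error reaches a full tick, one jiffy is taken back out. A standalone sketch of that accumulation; the ONECPUTICK and CPUTIME_ONE_JIFFY values below are made up for illustration, only the arithmetic mirrors the hunk.

#include <stdio.h>

#define ONECPUTICK        1000000	/* illustrative: ns per accounting tick */
#define CPUTIME_ONE_JIFFY 10		/* illustrative: one jiffy in cputime units */

int main(void)
{
	unsigned long long expires = 100;	/* next expiry, cputime units */
	unsigned long long incr = 25;		/* requested interval */
	unsigned int incr_error = 400000;	/* rounding error per interval, ns */
	unsigned int error = 0;
	int i;

	for (i = 0; i < 5; i++) {
		/* Re-arm as in the patch: advance, then accumulate the error. */
		expires += incr;
		error += incr_error;
		if (error >= ONECPUTICK) {
			expires -= CPUTIME_ONE_JIFFY;
			error -= ONECPUTICK;
		}
		printf("rearm %d: expires=%llu error=%u\n", i, expires, error);
	}
	return 0;
}
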
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1291,9 +1263,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1291 if (sig->cputimer.running) { 1263 if (sig->cputimer.running) {
1292 struct task_cputime group_sample; 1264 struct task_cputime group_sample;
1293 1265
1294 spin_lock(&sig->cputimer.lock); 1266 raw_spin_lock(&sig->cputimer.lock);
1295 group_sample = sig->cputimer.cputime; 1267 group_sample = sig->cputimer.cputime;
1296 spin_unlock(&sig->cputimer.lock); 1268 raw_spin_unlock(&sig->cputimer.lock);
1297 1269
1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1270 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1299 return 1; 1271 return 1;
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
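
All of the posix-cpu-timers.c hunks above follow one pattern: the cputime_eq()/cputime_lt()/cputime_add() helpers are replaced by ordinary integer comparisons and arithmetic, which presumes cputime_t behaves as a plain scalar. A minimal standalone sketch of the before and after forms of expires_gt(), modelling cputime_t as an unsigned 64-bit integer (an assumption made here purely for illustration):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's cputime_t. */
typedef uint64_t cputime_t;

/* Old style: opaque helpers, modelled here as plain functions. */
static int cputime_eq(cputime_t a, cputime_t b) { return a == b; }
static int cputime_gt(cputime_t a, cputime_t b) { return a > b; }
#define cputime_zero ((cputime_t)0)

static int expires_gt_old(cputime_t expires, cputime_t new_exp)
{
	return cputime_eq(expires, cputime_zero) ||
	       cputime_gt(expires, new_exp);
}

/* New style from the patch: plain integer operators. */
static int expires_gt_new(cputime_t expires, cputime_t new_exp)
{
	return expires == 0 || expires > new_exp;
}

int main(void)
{
	/* Both forms agree: 0 means "no expiry armed", otherwise compare. */
	assert(expires_gt_old(0, 5)  == expires_gt_new(0, 5));
	assert(expires_gt_old(10, 5) == expires_gt_new(10, 5));
	assert(expires_gt_old(3, 5)  == expires_gt_new(3, 5));
	puts("old and new expires_gt() agree");
	return 0;
}
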
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182527f3..69185ae6b701 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/export.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3744c594b19b..deb5461e3216 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 28 select LZO_COMPRESS
29 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
30 select CRC32
30 ---help--- 31 ---help---
31 Enable the suspend to disk (STD) functionality, which is usually 32 Enable the suspend to disk (STD) functionality, which is usually
32 called "hibernation" in user interfaces. STD checkpoints the 33 called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
65 66
66 For more information take a look at <file:Documentation/power/swsusp.txt>. 67 For more information take a look at <file:Documentation/power/swsusp.txt>.
67 68
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
68config PM_STD_PARTITION 72config PM_STD_PARTITION
69 string "Default resume partition" 73 string "Default resume partition"
70 depends on HIBERNATION 74 depends on HIBERNATION
@@ -235,3 +239,7 @@ config PM_GENERIC_DOMAINS
235config PM_GENERIC_DOMAINS_RUNTIME 239config PM_GENERIC_DOMAINS_RUNTIME
236 def_bool y 240 def_bool y
237 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 241 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
242
243config CPU_PM
244 bool
245 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a90643..07e0e28ffba7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o qos.o
5obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af90156..b1dc456474b5 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * drivers/power/process.c - Functions for saving/restoring console. 2 * Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,7 +10,6 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 14
16static int orig_fgconsole, orig_kmsg; 15static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
32 vt_kmsg_redirect(orig_kmsg); 31 vt_kmsg_redirect(orig_kmsg);
33 } 32 }
34} 33}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece1..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,11 +9,13 @@
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
10 */ 10 */
11 11
12#include <linux/export.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/reboot.h> 15#include <linux/reboot.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h>
17#include <linux/kmod.h> 19#include <linux/kmod.h>
18#include <linux/delay.h> 20#include <linux/delay.h>
19#include <linux/fs.h> 21#include <linux/fs.h>
@@ -29,18 +31,18 @@
29#include "power.h" 31#include "power.h"
30 32
31 33
32static int nocompress = 0; 34static int nocompress;
33static int noresume = 0; 35static int noresume;
36static int resume_wait;
37static int resume_delay;
34static char resume_file[256] = CONFIG_PM_STD_PARTITION; 38static char resume_file[256] = CONFIG_PM_STD_PARTITION;
35dev_t swsusp_resume_device; 39dev_t swsusp_resume_device;
36sector_t swsusp_resume_block; 40sector_t swsusp_resume_block;
37int in_suspend __nosavedata = 0; 41int in_suspend __nosavedata;
38 42
39enum { 43enum {
40 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
41 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
42 HIBERNATION_TEST,
43 HIBERNATION_TESTPROC,
44 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
45 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
46 /* keep last */ 48 /* keep last */
@@ -51,6 +53,8 @@ enum {
51 53
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 55
56bool freezer_test_done;
57
54static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
55 59
56/** 60/**
@@ -65,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
65 WARN_ON(1); 69 WARN_ON(1);
66 return; 70 return;
67 } 71 }
68 mutex_lock(&pm_mutex); 72 lock_system_sleep();
69 hibernation_ops = ops; 73 hibernation_ops = ops;
70 if (ops) 74 if (ops)
71 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
72 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
73 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
74 78
75 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
76} 80}
77 81
78static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -90,15 +94,6 @@ static void hibernation_debug_sleep(void)
90 mdelay(5000); 94 mdelay(5000);
91} 95}
92 96
93static int hibernation_testmode(int mode)
94{
95 if (hibernation_mode == mode) {
96 hibernation_debug_sleep();
97 return 1;
98 }
99 return 0;
100}
101
102static int hibernation_test(int level) 97static int hibernation_test(int level)
103{ 98{
104 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -108,7 +103,6 @@ static int hibernation_test(int level)
108 return 0; 103 return 0;
109} 104}
110#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
111static int hibernation_testmode(int mode) { return 0; }
112static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
113#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
114 108
@@ -272,8 +266,7 @@ static int create_image(int platform_mode)
272 goto Platform_finish; 266 goto Platform_finish;
273 267
274 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
275 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
276 || hibernation_testmode(HIBERNATION_TEST))
277 goto Enable_cpus; 270 goto Enable_cpus;
278 271
279 local_irq_disable(); 272 local_irq_disable();
@@ -327,38 +320,54 @@ static int create_image(int platform_mode)
327 */ 320 */
328int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
329{ 322{
330 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
331 int error; 324 int error;
332 325
333 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
334 if (error) 327 if (error)
335 goto Close; 328 goto Close;
336 329
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
341 /* Preallocate image memory before shutting down devices. */ 330 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory(); 331 error = hibernate_preallocate_memory();
343 if (error) 332 if (error)
344 goto Complete_devices; 333 goto Close;
334
335 error = freeze_kernel_threads();
336 if (error)
337 goto Cleanup;
338
339 if (hibernation_test(TEST_FREEZER)) {
340
341 /*
342 * Indicate to the caller that we are returning due to a
343 * successful freezer test.
344 */
345 freezer_test_done = true;
346 goto Cleanup;
347 }
348
349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) {
351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup;
353 }
345 354
346 suspend_console(); 355 suspend_console();
347 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
348 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
349 if (error)
350 goto Recover_platform;
351 359
352 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
353 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
354 364
355 error = create_image(platform_mode);
356 /* 365 /*
357 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
358 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
359 */ 369 */
360 370
361 Resume_devices:
362 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
363 if (error || !in_suspend) 372 if (error || !in_suspend)
364 swsusp_free(); 373 swsusp_free();
@@ -370,17 +379,15 @@ int hibernation_snapshot(int platform_mode)
370 pm_restore_gfp_mask(); 379 pm_restore_gfp_mask();
371 380
372 resume_console(); 381 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg); 382 dpm_complete(msg);
376 383
377 Close: 384 Close:
378 platform_end(platform_mode); 385 platform_end(platform_mode);
379 return error; 386 return error;
380 387
381 Recover_platform: 388 Cleanup:
382 platform_recover(platform_mode); 389 swsusp_free();
383 goto Resume_devices; 390 goto Close;
384} 391}
385 392
386/** 393/**
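
The rewritten hibernation_snapshot() above preallocates the image and freezes kernel threads before touching the devices, reports a successful freezer test back through freezer_test_done instead of a dedicated test mode, and funnels every failure through the Cleanup label. A compressed standalone model of that control flow, with the console, GFP-mask and dpm_suspend() steps omitted and every kernel helper replaced by a local stub (all stub bodies below are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

static bool freezer_test_done;		/* set on a successful freezer test */

/* Local stand-ins for the kernel helpers used in the hunk (all stubs). */
static int  platform_begin(int m)		{ (void)m; return 0; }
static void platform_end(int m)			{ (void)m; }
static int  hibernate_preallocate_memory(void)	{ return 0; }
static int  freeze_kernel_threads(void)		{ return 0; }
static bool hibernation_test_freezer(void)	{ return false; }
static int  dpm_prepare(void)			{ return 0; }
static void dpm_complete(void)			{ }
static void swsusp_free(void)			{ }
static int  create_image(int m)			{ (void)m; return 0; }

static int hibernation_snapshot_model(int platform_mode)
{
	int error;

	error = platform_begin(platform_mode);
	if (error)
		goto Close;

	/* Preallocate image memory before shutting down devices. */
	error = hibernate_preallocate_memory();
	if (error)
		goto Close;

	error = freeze_kernel_threads();
	if (error)
		goto Cleanup;

	if (hibernation_test_freezer()) {
		/* Tell the caller we only ran the freezer test. */
		freezer_test_done = true;
		goto Cleanup;
	}

	error = dpm_prepare();
	if (error) {
		dpm_complete();
		goto Cleanup;
	}

	error = create_image(platform_mode);
	dpm_complete();
 Close:
	platform_end(platform_mode);
	return error;

 Cleanup:
	swsusp_free();
	goto Close;
}

int main(void)
{
	printf("snapshot model returned %d, freezer_test_done=%d\n",
	       hibernation_snapshot_model(0), freezer_test_done);
	return 0;
}
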
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode)
463 * @platform_mode: If set, use platform driver to prepare for the transition. 470 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * 471 *
465 * This routine must be called with pm_mutex held. If it is successful, control 472 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot(). 473 * reappears in the restored target kernel in hibernation_snapshot().
467 */ 474 */
468int hibernation_restore(int platform_mode) 475int hibernation_restore(int platform_mode)
469{ 476{
@@ -565,9 +572,6 @@ int hibernation_platform_enter(void)
565static void power_down(void) 572static void power_down(void)
566{ 573{
567 switch (hibernation_mode) { 574 switch (hibernation_mode) {
568 case HIBERNATION_TEST:
569 case HIBERNATION_TESTPROC:
570 break;
571 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
572 kernel_restart(NULL); 576 kernel_restart(NULL);
573 break; 577 break;
@@ -586,17 +590,6 @@ static void power_down(void)
586 while(1); 590 while(1);
587} 591}
588 592
589static int prepare_processes(void)
590{
591 int error = 0;
592
593 if (freeze_processes()) {
594 error = -EBUSY;
595 thaw_processes();
596 }
597 return error;
598}
599
600/** 593/**
601 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
602 */ 595 */
@@ -604,7 +597,7 @@ int hibernate(void)
604{ 597{
605 int error; 598 int error;
606 599
607 mutex_lock(&pm_mutex); 600 lock_system_sleep();
608 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
609 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
610 error = -EBUSY; 603 error = -EBUSY;
@@ -629,19 +622,17 @@ int hibernate(void)
629 sys_sync(); 622 sys_sync();
630 printk("done.\n"); 623 printk("done.\n");
631 624
632 error = prepare_processes(); 625 error = freeze_processes();
633 if (error) 626 if (error)
634 goto Finish; 627 goto Finish;
635 628
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
641
642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
643 if (error) 630 if (error)
644 goto Thaw; 631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw;
635 }
645 636
646 if (in_suspend) { 637 if (in_suspend) {
647 unsigned int flags = 0; 638 unsigned int flags = 0;
@@ -650,6 +641,9 @@ int hibernate(void)
650 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
651 if (nocompress) 642 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE; 643 flags |= SF_NOCOMPRESS_MODE;
644 else
645 flags |= SF_CRC32_MODE;
646
653 pr_debug("PM: writing image.\n"); 647 pr_debug("PM: writing image.\n");
654 error = swsusp_write(flags); 648 error = swsusp_write(flags);
655 swsusp_free(); 649 swsusp_free();
@@ -671,7 +665,7 @@ int hibernate(void)
671 pm_restore_console(); 665 pm_restore_console();
672 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
673 Unlock: 667 Unlock:
674 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
675 return error; 669 return error;
676} 670}
677 671
@@ -724,6 +718,12 @@ static int software_resume(void)
724 718
725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 719 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
726 720
721 if (resume_delay) {
722 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
723 resume_delay);
724 ssleep(resume_delay);
725 }
726
727 /* Check if the device is there */ 727 /* Check if the device is there */
728 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
729 if (!swsusp_resume_device) { 729 if (!swsusp_resume_device) {
@@ -732,6 +732,13 @@ static int software_resume(void)
732 * to wait for this to finish. 732 * to wait for this to finish.
733 */ 733 */
734 wait_for_device_probe(); 734 wait_for_device_probe();
735
736 if (resume_wait) {
737 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
738 msleep(10);
739 async_synchronize_full();
740 }
741
735 /* 742 /*
736 * We can't depend on SCSI devices being available after loading 743 * We can't depend on SCSI devices being available after loading
737 * one of their modules until scsi_complete_async_scans() is 744 * one of their modules until scsi_complete_async_scans() is
@@ -772,11 +779,13 @@ static int software_resume(void)
772 goto close_finish; 779 goto close_finish;
773 780
774 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
775 if (error) 782 if (error) {
783 usermodehelper_enable();
776 goto close_finish; 784 goto close_finish;
785 }
777 786
778 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
779 error = prepare_processes(); 788 error = freeze_processes();
780 if (error) { 789 if (error) {
781 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
782 goto Done; 791 goto Done;
@@ -816,8 +825,6 @@ static const char * const hibernation_modes[] = {
816 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
817 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
818 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
819 [HIBERNATION_TEST] = "test",
820 [HIBERNATION_TESTPROC] = "testproc",
821}; 828};
822 829
823/* 830/*
@@ -826,17 +833,15 @@ static const char * const hibernation_modes[] = {
826 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
827 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
828 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
829 * mostly), or using one of the two available test modes. 836 * mostly).
830 * 837 *
831 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
832 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
834 * 841 *
835 * 'platform' 842 * 'platform'
836 * 'shutdown' 843 * 'shutdown'
837 * 'reboot' 844 * 'reboot'
838 * 'test'
839 * 'testproc'
840 * 845 *
841 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
842 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -860,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
860 switch (i) { 865 switch (i) {
861 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
862 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
863 case HIBERNATION_TEST:
864 case HIBERNATION_TESTPROC:
865 break; 868 break;
866 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
867 if (hibernation_ops) 870 if (hibernation_ops)
@@ -890,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
890 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
891 len = p ? p - buf : n; 894 len = p ? p - buf : n;
892 895
893 mutex_lock(&pm_mutex); 896 lock_system_sleep();
894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
895 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
896 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -902,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
902 switch (mode) { 905 switch (mode) {
903 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
904 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
905 case HIBERNATION_TEST:
906 case HIBERNATION_TESTPROC:
907 hibernation_mode = mode; 908 hibernation_mode = mode;
908 break; 909 break;
909 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -918,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
918 if (!error) 919 if (!error)
919 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
920 hibernation_modes[mode]); 921 hibernation_modes[mode]);
921 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
922 return error ? error : n; 923 return error ? error : n;
923} 924}
924 925
@@ -945,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
945 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
946 goto out; 947 goto out;
947 948
948 mutex_lock(&pm_mutex); 949 lock_system_sleep();
949 swsusp_resume_device = res; 950 swsusp_resume_device = res;
950 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
951 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
952 noresume = 0; 953 noresume = 0;
953 software_resume(); 954 software_resume();
@@ -1060,7 +1061,21 @@ static int __init noresume_setup(char *str)
1060 return 1; 1061 return 1;
1061} 1062}
1062 1063
1064static int __init resumewait_setup(char *str)
1065{
1066 resume_wait = 1;
1067 return 1;
1068}
1069
1070static int __init resumedelay_setup(char *str)
1071{
1072 resume_delay = simple_strtoul(str, NULL, 0);
1073 return 1;
1074}
1075
1063__setup("noresume", noresume_setup); 1076__setup("noresume", noresume_setup);
1064__setup("resume_offset=", resume_offset_setup); 1077__setup("resume_offset=", resume_offset_setup);
1065__setup("resume=", resume_setup); 1078__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup); 1079__setup("hibernate=", hibernate_setup);
1080__setup("resumewait", resumewait_setup);
1081__setup("resumedelay=", resumedelay_setup);
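
hibernate.c also gains two boot parameters: resumedelay=<seconds> sleeps before the resume device is looked up, and resumewait polls name_to_dev_t() until the device finally appears. A standalone sketch of that wait-or-delay decision; lookup_resume_device() below is a hypothetical stand-in for name_to_dev_t(), and the sleep calls stand in for ssleep()/msleep():

#include <stdio.h>
#include <unistd.h>

/* Hypothetical stand-in for name_to_dev_t(): 0 means "not there yet". */
static unsigned long lookup_resume_device(const char *name, int attempt)
{
	(void)name;
	return attempt >= 3 ? 0x803 : 0;	/* pretend it shows up on try 3 */
}

int main(void)
{
	int resume_wait = 1;		/* "resumewait" on the command line */
	int resume_delay = 0;		/* "resumedelay=<seconds>" */
	const char *resume_file = "/dev/sda3";
	unsigned long dev;
	int attempt = 0;

	if (resume_delay) {
		printf("Waiting %dsec before reading resume device...\n",
		       resume_delay);
		sleep(resume_delay);		/* ssleep() in the kernel */
	}

	dev = lookup_resume_device(resume_file, attempt);
	if (!dev && resume_wait) {
		/* Poll until the device node appears, as the patch does. */
		while ((dev = lookup_resume_device(resume_file, ++attempt)) == 0)
			usleep(10 * 1000);	/* msleep(10) in the kernel */
	}

	printf("resume device %s -> %#lx after %d retries\n",
	       resume_file, dev, attempt);
	return 0;
}
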
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f871964..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,15 +3,18 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
11#include <linux/kobject.h> 12#include <linux/kobject.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/resume-trace.h> 14#include <linux/resume-trace.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
15 18
16#include "power.h" 19#include "power.h"
17 20
@@ -113,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
113 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
114 len = p ? p - buf : n; 117 len = p ? p - buf : n;
115 118
116 mutex_lock(&pm_mutex); 119 lock_system_sleep();
117 120
118 level = TEST_FIRST; 121 level = TEST_FIRST;
119 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -123,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
123 break; 126 break;
124 } 127 }
125 128
126 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
127 130
128 return error ? error : n; 131 return error ? error : n;
129} 132}
@@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
131power_attr(pm_test); 134power_attr(pm_test);
132#endif /* CONFIG_PM_DEBUG */ 135#endif /* CONFIG_PM_DEBUG */
133 136
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume,
178 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
181 suspend_stats.failed_devs[last_dev]);
182 for (i = 1; i < REC_FAILED_NUM; i++) {
183 index = last_dev + REC_FAILED_NUM - i;
184 index %= REC_FAILED_NUM;
185 seq_printf(s, "\t\t\t%-s\n",
186 suspend_stats.failed_devs[index]);
187 }
188 seq_printf(s, " last_failed_errno:\t%-d\n",
189 suspend_stats.errno[last_errno]);
190 for (i = 1; i < REC_FAILED_NUM; i++) {
191 index = last_errno + REC_FAILED_NUM - i;
192 index %= REC_FAILED_NUM;
193 seq_printf(s, "\t\t\t%-d\n",
194 suspend_stats.errno[index]);
195 }
196 seq_printf(s, " last_failed_step:\t%-s\n",
197 suspend_step_name(
198 suspend_stats.failed_steps[last_step]));
199 for (i = 1; i < REC_FAILED_NUM; i++) {
200 index = last_step + REC_FAILED_NUM - i;
201 index %= REC_FAILED_NUM;
202 seq_printf(s, "\t\t\t%-s\n",
203 suspend_step_name(
204 suspend_stats.failed_steps[index]));
205 }
206
207 return 0;
208}
209
210static int suspend_stats_open(struct inode *inode, struct file *file)
211{
212 return single_open(file, suspend_stats_show, NULL);
213}
214
215static const struct file_operations suspend_stats_operations = {
216 .open = suspend_stats_open,
217 .read = seq_read,
218 .llseek = seq_lseek,
219 .release = single_release,
220};
221
222static int __init pm_debugfs_init(void)
223{
224 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
225 NULL, NULL, &suspend_stats_operations);
226 return 0;
227}
228
229late_initcall(pm_debugfs_init);
230#endif /* CONFIG_DEBUG_FS */
231
134#endif /* CONFIG_PM_SLEEP */ 232#endif /* CONFIG_PM_SLEEP */
135 233
136struct kobject *power_kobj; 234struct kobject *power_kobj;
@@ -142,7 +240,7 @@ struct kobject *power_kobj;
142 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
143 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
144 * 242 *
145 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
146 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
147 */ 245 */
148static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -184,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
184 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
185 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
186 error = hibernate(); 284 error = hibernate();
187 goto Exit; 285 goto Exit;
188 } 286 }
189 287
190#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
@@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
193 break; 291 break;
194 } 292 }
195 if (state < PM_SUSPEND_MAX && *s) 293 if (state < PM_SUSPEND_MAX && *s) {
196 error = enter_state(state); 294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 }
197#endif 301#endif
198 302
199 Exit: 303 Exit:
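
The new suspend_stats debugfs file walks each REC_FAILED_NUM-slot ring backwards so the most recent failure prints first; the index arithmetic is the subtle part. A standalone model of just that walk, assuming REC_FAILED_NUM is 2 (the value is an assumption here; the real definition lives in include/linux/suspend.h) and treating last_failed_dev as the next write position:

#include <stdio.h>

#define REC_FAILED_NUM 2	/* assumed value; see include/linux/suspend.h */

int main(void)
{
	const char *failed_devs[REC_FAILED_NUM] = { "usb1", "eth0" };
	int next = 0;		/* models suspend_stats.last_failed_dev */
	int last_dev, i, index;

	/* Most recent entry sits one slot behind the write cursor. */
	last_dev = (next + REC_FAILED_NUM - 1) % REC_FAILED_NUM;
	printf("failures:\n last_failed_dev:\t%s\n", failed_devs[last_dev]);

	/* Older entries, newest to oldest, as suspend_stats_show() does. */
	for (i = 1; i < REC_FAILED_NUM; i++) {
		index = (last_dev + REC_FAILED_NUM - i) % REC_FAILED_NUM;
		printf("\t\t\t%s\n", failed_devs[index]);
	}
	return 0;
}
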
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a26280..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
@@ -146,6 +148,7 @@ extern int swsusp_swap_in_use(void);
146 */ 148 */
147#define SF_PLATFORM_MODE 1 149#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2 150#define SF_NOCOMPRESS_MODE 2
151#define SF_CRC32_MODE 4
149 152
150/* kernel/power/hibernate.c */ 153/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 154extern int swsusp_check(void);
@@ -228,7 +231,8 @@ extern int pm_test_level;
228#ifdef CONFIG_SUSPEND_FREEZER 231#ifdef CONFIG_SUSPEND_FREEZER
229static inline int suspend_freeze_processes(void) 232static inline int suspend_freeze_processes(void)
230{ 233{
231 return freeze_processes(); 234 int error = freeze_processes();
235 return error ? : freeze_kernel_threads();
232} 236}
233 237
234static inline void suspend_thaw_processes(void) 238static inline void suspend_thaw_processes(void)
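
suspend_freeze_processes() above now chains the two freezing stages with the GNU "?:" shorthand: it returns the error from freeze_processes() if there is one, otherwise whatever freeze_kernel_threads() returns. The same logic written out in portable C with both stages stubbed locally:

#include <stdio.h>

static int freeze_processes(void)	{ return 0; }	/* stub: success */
static int freeze_kernel_threads(void)	{ return -16; }	/* stub: -EBUSY  */

static int suspend_freeze_processes(void)
{
	int error = freeze_processes();

	/* Equivalent to the patch's "return error ? : freeze_kernel_threads();" */
	return error ? error : freeze_kernel_threads();
}

int main(void)
{
	printf("suspend_freeze_processes() -> %d\n", suspend_freeze_processes());
	return 0;
}
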
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9d..77274c9ba2f1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -135,60 +114,76 @@ static int try_to_freeze_tasks(bool sig_only)
135} 114}
136 115
137/** 116/**
138 * freeze_processes - tell processes to enter the refrigerator 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
131 if (!error) {
132 printk("done.");
133 oom_killer_disable();
134 }
135 printk("\n");
136 BUG_ON(in_atomic());
137
146 if (error) 138 if (error)
147 goto Exit; 139 thaw_processes();
148 printk("done.\n"); 140 return error;
141}
142
143/**
144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed.
147 */
148int freeze_kernel_threads(void)
149{
150 int error;
149 151
150 printk("Freezing remaining freezable tasks ... "); 152 printk("Freezing remaining freezable tasks ... ");
153 pm_nosig_freezing = true;
151 error = try_to_freeze_tasks(false); 154 error = try_to_freeze_tasks(false);
152 if (error) 155 if (!error)
153 goto Exit; 156 printk("done.");
154 printk("done.");
155 157
156 oom_killer_disable();
157 Exit:
158 BUG_ON(in_atomic());
159 printk("\n"); 158 printk("\n");
159 BUG_ON(in_atomic());
160 160
161 if (error)
162 thaw_processes();
161 return error; 163 return error;
162} 164}
163 165
164static void thaw_tasks(bool nosig_only) 166void thaw_processes(void)
165{ 167{
166 struct task_struct *g, *p; 168 struct task_struct *g, *p;
167 169
168 read_lock(&tasklist_lock); 170 if (pm_freezing)
169 do_each_thread(g, p) { 171 atomic_dec(&system_freezing_cnt);
170 if (!freezable(p)) 172 pm_freezing = false;
171 continue; 173 pm_nosig_freezing = false;
172 174
173 if (nosig_only && should_send_signal(p)) 175 oom_killer_enable();
174 continue;
175 176
176 if (cgroup_freezing_or_frozen(p)) 177 printk("Restarting tasks ... ");
177 continue;
178 178
179 thaw_process(p); 179 thaw_workqueues();
180
181 read_lock(&tasklist_lock);
182 do_each_thread(g, p) {
183 __thaw_task(p);
180 } while_each_thread(g, p); 184 } while_each_thread(g, p);
181 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
182}
183
184void thaw_processes(void)
185{
186 oom_killer_enable();
187 186
188 printk("Restarting tasks ... ");
189 thaw_workqueues();
190 thaw_tasks(true);
191 thaw_tasks(false);
192 schedule(); 187 schedule();
193 printk("done.\n"); 188 printk("done.\n");
194} 189}
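
After the process.c rework above, freezing is split into freeze_processes() for user space and freeze_kernel_threads() for the remaining freezable tasks, and each of them thaws the whole system itself if it fails, so callers no longer need the old unwind dance. A standalone model of that contract as a caller sees it; every function below is a local stub standing in for the kernel one of the same name:

#include <stdio.h>

static int tasks_frozen;

static void thaw_processes(void)	{ tasks_frozen = 0; puts("thawed"); }

static int try_to_freeze_tasks(int user_only)
{
	tasks_frozen += user_only ? 1 : 2;
	return 0;				/* pretend everything froze */
}

static int freeze_processes(void)
{
	int error = try_to_freeze_tasks(1);	/* user space only */

	if (error)
		thaw_processes();		/* failure thaws everything */
	return error;
}

static int freeze_kernel_threads(void)
{
	int error = try_to_freeze_tasks(0);	/* remaining freezable tasks */

	if (error)
		thaw_processes();
	return error;
}

int main(void)
{
	if (freeze_processes() == 0 && freeze_kernel_threads() == 0)
		printf("frozen (%d stages), caller must thaw explicitly\n",
		       tasks_frozen);
	thaw_processes();
	return 0;
}
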
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f0793..995e3bd3417b 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
29 29
30/*#define DEBUG*/ 30/*#define DEBUG*/
31 31
32#include <linux/pm_qos_params.h> 32#include <linux/pm_qos.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
@@ -43,64 +43,61 @@
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h>
46 47
47/* 48/*
48 * locking rule: all changes to requests or notifiers lists 49 * locking rule: all changes to constraints or notifiers lists
49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
50 * held, taken with _irqsave. One lock to rule them all 51 * held, taken with _irqsave. One lock to rule them all
51 */ 52 */
52enum pm_qos_type {
53 PM_QOS_MAX, /* return the largest value */
54 PM_QOS_MIN /* return the smallest value */
55};
56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
62struct pm_qos_object { 53struct pm_qos_object {
63 struct plist_head requests; 54 struct pm_qos_constraints *constraints;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 55 struct miscdevice pm_qos_power_miscdev;
66 char *name; 56 char *name;
67 s32 target_value; /* Do not change to 64 bit */
68 s32 default_value;
69 enum pm_qos_type type;
70}; 57};
71 58
72static DEFINE_SPINLOCK(pm_qos_lock); 59static DEFINE_SPINLOCK(pm_qos_lock);
73 60
74static struct pm_qos_object null_pm_qos; 61static struct pm_qos_object null_pm_qos;
62
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 64static struct pm_qos_constraints cpu_dma_constraints = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), 65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
83}; 74};
84 75
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 77static struct pm_qos_constraints network_lat_constraints = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), 78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
88 .notifiers = &network_lat_notifier,
89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
92 .type = PM_QOS_MIN 81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
93}; 87};
94 88
95 89
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 91static struct pm_qos_constraints network_tput_constraints = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), 92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
103 .type = PM_QOS_MAX, 95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
104}; 101};
105 102
106 103
@@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = {
127}; 124};
128 125
129/* unlocked internal variant */ 126/* unlocked internal variant */
130static inline int pm_qos_get_value(struct pm_qos_object *o) 127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
131{ 128{
132 if (plist_head_empty(&o->requests)) 129 if (plist_head_empty(&c->list))
133 return o->default_value; 130 return c->default_value;
134 131
135 switch (o->type) { 132 switch (c->type) {
136 case PM_QOS_MIN: 133 case PM_QOS_MIN:
137 return plist_first(&o->requests)->prio; 134 return plist_first(&c->list)->prio;
138 135
139 case PM_QOS_MAX: 136 case PM_QOS_MAX:
140 return plist_last(&o->requests)->prio; 137 return plist_last(&c->list)->prio;
141 138
142 default: 139 default:
143 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
@@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
145 } 142 }
146} 143}
147 144
148static inline s32 pm_qos_read_value(struct pm_qos_object *o) 145s32 pm_qos_read_value(struct pm_qos_constraints *c)
149{ 146{
150 return o->target_value; 147 return c->target_value;
151} 148}
152 149
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) 150static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
154{ 151{
155 o->target_value = value; 152 c->target_value = value;
156} 153}
157 154
158static void update_target(struct pm_qos_object *o, struct plist_node *node, 155/**
159 int del, int value) 156 * pm_qos_update_target - manages the constraints list and calls the notifiers
157 * if needed
158 * @c: constraints data struct
159 * @node: request to add to the list, to update or to remove
160 * @action: action to take on the constraints list
161 * @value: value of the request to add or update
162 *
163 * This function returns 1 if the aggregated constraint value has changed, 0
164 * otherwise.
165 */
166int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
167 enum pm_qos_req_action action, int value)
160{ 168{
161 unsigned long flags; 169 unsigned long flags;
162 int prev_value, curr_value; 170 int prev_value, curr_value, new_value;
163 171
164 spin_lock_irqsave(&pm_qos_lock, flags); 172 spin_lock_irqsave(&pm_qos_lock, flags);
165 prev_value = pm_qos_get_value(o); 173 prev_value = pm_qos_get_value(c);
166 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ 174 if (value == PM_QOS_DEFAULT_VALUE)
167 if (value != PM_QOS_DEFAULT_VALUE) { 175 new_value = c->default_value;
176 else
177 new_value = value;
178
179 switch (action) {
180 case PM_QOS_REMOVE_REQ:
181 plist_del(node, &c->list);
182 break;
183 case PM_QOS_UPDATE_REQ:
168 /* 184 /*
169 * to change the list, we atomically remove, reinit 185 * to change the list, we atomically remove, reinit
170 * with new value and add, then see if the extremal 186 * with new value and add, then see if the extremal
171 * changed 187 * changed
172 */ 188 */
173 plist_del(node, &o->requests); 189 plist_del(node, &c->list);
174 plist_node_init(node, value); 190 case PM_QOS_ADD_REQ:
175 plist_add(node, &o->requests); 191 plist_node_init(node, new_value);
176 } else if (del) { 192 plist_add(node, &c->list);
177 plist_del(node, &o->requests); 193 break;
178 } else { 194 default:
179 plist_add(node, &o->requests); 195 /* no action */
196 ;
180 } 197 }
181 curr_value = pm_qos_get_value(o); 198
182 pm_qos_set_value(o, curr_value); 199 curr_value = pm_qos_get_value(c);
200 pm_qos_set_value(c, curr_value);
201
183 spin_unlock_irqrestore(&pm_qos_lock, flags); 202 spin_unlock_irqrestore(&pm_qos_lock, flags);
184 203
185 if (prev_value != curr_value) 204 if (prev_value != curr_value) {
186 blocking_notifier_call_chain(o->notifiers, 205 blocking_notifier_call_chain(c->notifiers,
187 (unsigned long)curr_value, 206 (unsigned long)curr_value,
188 NULL); 207 NULL);
189} 208 return 1;
190 209 } else {
191static int register_pm_qos_misc(struct pm_qos_object *qos) 210 return 0;
192{
193 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
194 qos->pm_qos_power_miscdev.name = qos->name;
195 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
196
197 return misc_register(&qos->pm_qos_power_miscdev);
198}
199
200static int find_pm_qos_object_by_minor(int minor)
201{
202 int pm_qos_class;
203
204 for (pm_qos_class = 0;
205 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
206 if (minor ==
207 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
208 return pm_qos_class;
209 } 211 }
210 return -1;
211} 212}
212 213
213/** 214/**
@@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor)
218 */ 219 */
219int pm_qos_request(int pm_qos_class) 220int pm_qos_request(int pm_qos_class)
220{ 221{
221 return pm_qos_read_value(pm_qos_array[pm_qos_class]); 222 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
222} 223}
223EXPORT_SYMBOL_GPL(pm_qos_request); 224EXPORT_SYMBOL_GPL(pm_qos_request);
224 225
225int pm_qos_request_active(struct pm_qos_request_list *req) 226int pm_qos_request_active(struct pm_qos_request *req)
226{ 227{
227 return req->pm_qos_class != 0; 228 return req->pm_qos_class != 0;
228} 229}
@@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
230 231
231/** 232/**
232 * pm_qos_add_request - inserts new qos request into the list 233 * pm_qos_add_request - inserts new qos request into the list
233 * @dep: pointer to a preallocated handle 234 * @req: pointer to a preallocated handle
234 * @pm_qos_class: identifies which list of qos request to use 235 * @pm_qos_class: identifies which list of qos request to use
235 * @value: defines the qos request 236 * @value: defines the qos request
236 * 237 *
237 * This function inserts a new entry in the pm_qos_class list of requested qos 238 * This function inserts a new entry in the pm_qos_class list of requested qos
238 * performance characteristics. It recomputes the aggregate QoS expectations 239 * performance characteristics. It recomputes the aggregate QoS expectations
239 * for the pm_qos_class of parameters and initializes the pm_qos_request_list 240 * for the pm_qos_class of parameters and initializes the pm_qos_request
240 * handle. Caller needs to save this handle for later use in updates and 241 * handle. Caller needs to save this handle for later use in updates and
241 * removal. 242 * removal.
242 */ 243 */
243 244
244void pm_qos_add_request(struct pm_qos_request_list *dep, 245void pm_qos_add_request(struct pm_qos_request *req,
245 int pm_qos_class, s32 value) 246 int pm_qos_class, s32 value)
246{ 247{
247 struct pm_qos_object *o = pm_qos_array[pm_qos_class]; 248 if (!req) /*guard against callers passing in null */
248 int new_value; 249 return;
249 250
250 if (pm_qos_request_active(dep)) { 251 if (pm_qos_request_active(req)) {
251 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); 252 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
252 return; 253 return;
253 } 254 }
254 if (value == PM_QOS_DEFAULT_VALUE) 255 req->pm_qos_class = pm_qos_class;
255 new_value = o->default_value; 256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
256 else 257 &req->node, PM_QOS_ADD_REQ, value);
257 new_value = value;
258 plist_node_init(&dep->list, new_value);
259 dep->pm_qos_class = pm_qos_class;
260 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
261} 258}
262EXPORT_SYMBOL_GPL(pm_qos_add_request); 259EXPORT_SYMBOL_GPL(pm_qos_add_request);
263 260
264/** 261/**
265 * pm_qos_update_request - modifies an existing qos request 262 * pm_qos_update_request - modifies an existing qos request
266 * @pm_qos_req : handle to list element holding a pm_qos request to use 263 * @req : handle to list element holding a pm_qos request to use
267 * @value: defines the qos request 264 * @value: defines the qos request
268 * 265 *
269 * Updates an existing qos request for the pm_qos_class of parameters along 266 * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
271 * 268 *
272 * Attempts are made to make this code callable on hot code paths. 269 * Attempts are made to make this code callable on hot code paths.
273 */ 270 */
274void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 271void pm_qos_update_request(struct pm_qos_request *req,
275 s32 new_value) 272 s32 new_value)
276{ 273{
277 s32 temp; 274 if (!req) /*guard against callers passing in null */
278 struct pm_qos_object *o;
279
280 if (!pm_qos_req) /*guard against callers passing in null */
281 return; 275 return;
282 276
283 if (!pm_qos_request_active(pm_qos_req)) { 277 if (!pm_qos_request_active(req)) {
284 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); 278 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
285 return; 279 return;
286 } 280 }
287 281
288 o = pm_qos_array[pm_qos_req->pm_qos_class]; 282 if (new_value != req->node.prio)
289 283 pm_qos_update_target(
290 if (new_value == PM_QOS_DEFAULT_VALUE) 284 pm_qos_array[req->pm_qos_class]->constraints,
291 temp = o->default_value; 285 &req->node, PM_QOS_UPDATE_REQ, new_value);
292 else
293 temp = new_value;
294
295 if (temp != pm_qos_req->list.prio)
296 update_target(o, &pm_qos_req->list, 0, temp);
297} 286}
298EXPORT_SYMBOL_GPL(pm_qos_update_request); 287EXPORT_SYMBOL_GPL(pm_qos_update_request);
299 288
300/** 289/**
301 * pm_qos_remove_request - modifies an existing qos request 290 * pm_qos_remove_request - modifies an existing qos request
302 * @pm_qos_req: handle to request list element 291 * @req: handle to request list element
303 * 292 *
304 * Will remove pm qos request from the list of requests and 293 * Will remove pm qos request from the list of constraints and
305 * recompute the current target value for the pm_qos_class. Call this 294 * recompute the current target value for the pm_qos_class. Call this
306 * on slow code paths. 295 * on slow code paths.
307 */ 296 */
308void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 297void pm_qos_remove_request(struct pm_qos_request *req)
309{ 298{
310 struct pm_qos_object *o; 299 if (!req) /*guard against callers passing in null */
311
312 if (pm_qos_req == NULL)
313 return; 300 return;
314 /* silent return to keep pcm code cleaner */ 301 /* silent return to keep pcm code cleaner */
315 302
316 if (!pm_qos_request_active(pm_qos_req)) { 303 if (!pm_qos_request_active(req)) {
317 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); 304 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
318 return; 305 return;
319 } 306 }
320 307
321 o = pm_qos_array[pm_qos_req->pm_qos_class]; 308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
322 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); 309 &req->node, PM_QOS_REMOVE_REQ,
323 memset(pm_qos_req, 0, sizeof(*pm_qos_req)); 310 PM_QOS_DEFAULT_VALUE);
311 memset(req, 0, sizeof(*req));
324} 312}
325EXPORT_SYMBOL_GPL(pm_qos_remove_request); 313EXPORT_SYMBOL_GPL(pm_qos_remove_request);
326 314
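For reference, a minimal sketch of how a kernel-side user of this API looks after the rework (hedged: the PM_QOS_CPU_DMA_LATENCY class, the 100 us value and the <linux/pm_qos.h> header name are illustrative assumptions, not part of this hunk):

    #include <linux/pm_qos.h>

    static struct pm_qos_request my_qos_req;

    static void my_driver_go_lowlatency(void)
    {
            /* Cap CPU DMA latency at 100 us while the device is active. */
            pm_qos_add_request(&my_qos_req, PM_QOS_CPU_DMA_LATENCY, 100);
    }

    static void my_driver_relax(void)
    {
            /* Keep the request but fall back to the class default. */
            pm_qos_update_request(&my_qos_req, PM_QOS_DEFAULT_VALUE);
    }

    static void my_driver_stop(void)
    {
            /* Drop the request; the class target value is recomputed. */
            pm_qos_remove_request(&my_qos_req);
    }

The request object is now a plain struct pm_qos_request embedded by the caller; the bookkeeping lives in the per-class constraints object updated through pm_qos_update_target().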
@@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
337 int retval; 325 int retval;
338 326
339 retval = blocking_notifier_chain_register( 327 retval = blocking_notifier_chain_register(
340 pm_qos_array[pm_qos_class]->notifiers, notifier); 328 pm_qos_array[pm_qos_class]->constraints->notifiers,
329 notifier);
341 330
342 return retval; 331 return retval;
343} 332}
@@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
356 int retval; 345 int retval;
357 346
358 retval = blocking_notifier_chain_unregister( 347 retval = blocking_notifier_chain_unregister(
359 pm_qos_array[pm_qos_class]->notifiers, notifier); 348 pm_qos_array[pm_qos_class]->constraints->notifiers,
349 notifier);
360 350
361 return retval; 351 return retval;
362} 352}
363EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 353EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
364 354
355/* User space interface to PM QoS classes via misc devices */
356static int register_pm_qos_misc(struct pm_qos_object *qos)
357{
358 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
359 qos->pm_qos_power_miscdev.name = qos->name;
360 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
361
362 return misc_register(&qos->pm_qos_power_miscdev);
363}
364
365static int find_pm_qos_object_by_minor(int minor)
366{
367 int pm_qos_class;
368
369 for (pm_qos_class = 0;
370 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
371 if (minor ==
372 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
373 return pm_qos_class;
374 }
375 return -1;
376}
377
365static int pm_qos_power_open(struct inode *inode, struct file *filp) 378static int pm_qos_power_open(struct inode *inode, struct file *filp)
366{ 379{
367 long pm_qos_class; 380 long pm_qos_class;
368 381
369 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 382 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
370 if (pm_qos_class >= 0) { 383 if (pm_qos_class >= 0) {
371 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); 384 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
372 if (!req) 385 if (!req)
373 return -ENOMEM; 386 return -ENOMEM;
374 387
375 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); 388 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
376 filp->private_data = req; 389 filp->private_data = req;
377 390
378 if (filp->private_data) 391 return 0;
379 return 0;
380 } 392 }
381 return -EPERM; 393 return -EPERM;
382} 394}
383 395
384static int pm_qos_power_release(struct inode *inode, struct file *filp) 396static int pm_qos_power_release(struct inode *inode, struct file *filp)
385{ 397{
386 struct pm_qos_request_list *req; 398 struct pm_qos_request *req;
387 399
388 req = filp->private_data; 400 req = filp->private_data;
389 pm_qos_remove_request(req); 401 pm_qos_remove_request(req);
@@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
398{ 410{
399 s32 value; 411 s32 value;
400 unsigned long flags; 412 unsigned long flags;
401 struct pm_qos_object *o; 413 struct pm_qos_request *req = filp->private_data;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403 414
404 if (!pm_qos_req) 415 if (!req)
405 return -EINVAL; 416 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req)) 417 if (!pm_qos_request_active(req))
407 return -EINVAL; 418 return -EINVAL;
408 419
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags); 420 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o); 421 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
412 spin_unlock_irqrestore(&pm_qos_lock, flags); 422 spin_unlock_irqrestore(&pm_qos_lock, flags);
413 423
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); 424 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
418 size_t count, loff_t *f_pos) 428 size_t count, loff_t *f_pos)
419{ 429{
420 s32 value; 430 s32 value;
421 struct pm_qos_request_list *pm_qos_req; 431 struct pm_qos_request *req;
422 432
423 if (count == sizeof(s32)) { 433 if (count == sizeof(s32)) {
424 if (copy_from_user(&value, buf, sizeof(s32))) 434 if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
449 return -EINVAL; 459 return -EINVAL;
450 } 460 }
451 461
452 pm_qos_req = filp->private_data; 462 req = filp->private_data;
453 pm_qos_update_request(pm_qos_req, value); 463 pm_qos_update_request(req, value);
454 464
455 return count; 465 return count;
456} 466}
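The misc-device interface keeps its shape: each class is exposed as a character device that accepts a binary s32, and the request lives for as long as the file descriptor stays open. A user-space sketch (the /dev/cpu_dma_latency node for the CPU DMA latency class is assumed here, not shown in this hunk):

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t us = 100;       /* request no more than 100 us of latency */
            int fd = open("/dev/cpu_dma_latency", O_RDWR);

            if (fd < 0)
                    return 1;
            if (write(fd, &us, sizeof(us)) != sizeof(us)) {
                    close(fd);
                    return 1;
            }
            pause();                /* hold the request; it is dropped on close() */
            close(fd);
            return 0;
    }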
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d6..1cf88900ec4f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 858 PageReserved(page))
859 return NULL; 859 return NULL;
860 860
861 if (page_is_guard(page))
862 return NULL;
863
861 return page; 864 return page;
862} 865}
863 866
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 923 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 924 return NULL;
922 925
926 if (page_is_guard(page))
927 return NULL;
928
923 return page; 929 return page;
924} 930}
925 931
@@ -1339,6 +1345,9 @@ int hibernate_preallocate_memory(void)
1339 count += highmem; 1345 count += highmem;
1340 count -= totalreserve_pages; 1346 count -= totalreserve_pages;
1341 1347
1348 /* Add number of pages required for page keys (s390 only). */
1349 size += page_key_additional_pages(saveable);
1350
1342 /* Compute the maximum number of saveable pages to leave in memory. */ 1351 /* Compute the maximum number of saveable pages to leave in memory. */
1343 max_size = (count - (size + PAGES_FOR_IO)) / 2 1352 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1353 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1671,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1662 buf[j] = memory_bm_next_pfn(bm); 1671 buf[j] = memory_bm_next_pfn(bm);
1663 if (unlikely(buf[j] == BM_END_OF_MAP)) 1672 if (unlikely(buf[j] == BM_END_OF_MAP))
1664 break; 1673 break;
1674 /* Save page key for data page (s390 only). */
1675 page_key_read(buf + j);
1665 } 1676 }
1666} 1677}
1667 1678
@@ -1821,6 +1832,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1821 if (unlikely(buf[j] == BM_END_OF_MAP)) 1832 if (unlikely(buf[j] == BM_END_OF_MAP))
1822 break; 1833 break;
1823 1834
1835 /* Extract and buffer page key for data page (s390 only). */
1836 page_key_memorize(buf + j);
1837
1824 if (memory_bm_pfn_present(bm, buf[j])) 1838 if (memory_bm_pfn_present(bm, buf[j]))
1825 memory_bm_set_bit(bm, buf[j]); 1839 memory_bm_set_bit(bm, buf[j]);
1826 else 1840 else
@@ -2223,6 +2237,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
2223 if (error) 2237 if (error)
2224 return error; 2238 return error;
2225 2239
2240 /* Allocate buffer for page keys. */
2241 error = page_key_alloc(nr_copy_pages);
2242 if (error)
2243 return error;
2244
2226 } else if (handle->cur <= nr_meta_pages + 1) { 2245 } else if (handle->cur <= nr_meta_pages + 1) {
2227 error = unpack_orig_pfns(buffer, &copy_bm); 2246 error = unpack_orig_pfns(buffer, &copy_bm);
2228 if (error) 2247 if (error)
@@ -2243,6 +2262,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2243 } 2262 }
2244 } else { 2263 } else {
2245 copy_last_highmem_page(); 2264 copy_last_highmem_page();
2265 /* Restore page key for data page (s390 only). */
2266 page_key_write(handle->buffer);
2246 handle->buffer = get_buffer(&orig_bm, &ca); 2267 handle->buffer = get_buffer(&orig_bm, &ca);
2247 if (IS_ERR(handle->buffer)) 2268 if (IS_ERR(handle->buffer))
2248 return PTR_ERR(handle->buffer); 2269 return PTR_ERR(handle->buffer);
@@ -2264,6 +2285,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264void snapshot_write_finalize(struct snapshot_handle *handle) 2285void snapshot_write_finalize(struct snapshot_handle *handle)
2265{ 2286{
2266 copy_last_highmem_page(); 2287 copy_last_highmem_page();
2288 /* Restore page key for data page (s390 only). */
2289 page_key_write(handle->buffer);
2290 page_key_free();
2267 /* Free only if we have loaded the image entirely */ 2291 /* Free only if we have loaded the image entirely */
2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2292 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2293 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208f..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,6 +12,7 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
15#include <linux/console.h> 16#include <linux/console.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -21,6 +22,7 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
24#include <linux/suspend.h> 26#include <linux/suspend.h>
25#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
26#include <trace/events/power.h> 28#include <trace/events/power.h>
@@ -40,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
40 */ 42 */
41void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
42{ 44{
43 mutex_lock(&pm_mutex); 45 lock_system_sleep();
44 suspend_ops = ops; 46 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
46} 48}
47EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
48 50
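suspend_set_ops() is how a platform plugs in its low-level suspend callbacks; only the locking around the assignment changes here. A minimal registration sketch (the board_* names are placeholders, and only the callbacks needed for suspend-to-RAM are shown):

    #include <linux/init.h>
    #include <linux/suspend.h>

    static int board_suspend_enter(suspend_state_t state)
    {
            /* Program the SoC/firmware to enter the requested sleep state. */
            return 0;
    }

    static const struct platform_suspend_ops board_suspend_ops = {
            .valid = suspend_valid_only_mem,
            .enter = board_suspend_enter,
    };

    static int __init board_pm_init(void)
    {
            suspend_set_ops(&board_suspend_ops);
            return 0;
    }
    late_initcall(board_pm_init);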
@@ -107,7 +109,8 @@ static int suspend_prepare(void)
107 if (!error) 109 if (!error)
108 return 0; 110 return 0;
109 111
110 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
111 usermodehelper_enable(); 114 usermodehelper_enable();
112 Finish: 115 Finish:
113 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state)
315 */ 318 */
316int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
317{ 320{
318 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 321 int ret;
319 return enter_state(state); 322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
323 ret = enter_state(state);
324 if (ret) {
325 suspend_stats.fail++;
326 dpm_save_failed_errno(ret);
327 } else
328 suspend_stats.success++;
329 return ret;
330 }
320 return -EINVAL; 331 return -EINVAL;
321} 332}
322EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
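pm_suspend() now also feeds the suspend_stats counters, and note that the range check tightens from <= PM_SUSPEND_MAX to < PM_SUSPEND_MAX. Callers can keep relying on the return value alone; a hedged sketch of an in-kernel caller (the usual entry point is the sysfs 'state' file, so a direct call like this is purely illustrative):

    #include <linux/kernel.h>
    #include <linux/suspend.h>

    static int try_suspend_to_ram(void)
    {
            int error = pm_suspend(PM_SUSPEND_MEM);

            if (error)
                    pr_err("PM: suspend to RAM failed: %d\n", error);
            return error;
    }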
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee3..3739ecced085 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -27,6 +26,10 @@
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/lzo.h> 27#include <linux/lzo.h>
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/cpumask.h>
30#include <linux/atomic.h>
31#include <linux/kthread.h>
32#include <linux/crc32.h>
30 33
31#include "power.h" 34#include "power.h"
32 35
@@ -43,8 +46,7 @@
43 * allocated and populated one at a time, so we only need one memory 46 * allocated and populated one at a time, so we only need one memory
44 * page to set up the entire structure. 47 * page to set up the entire structure.
45 * 48 *
46 * During resume we also only need to use one swap_map_page structure 49 * During resume we pick up all swap_map_page structures into a list.
47 * at a time.
48 */ 50 */
49 51
50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +56,11 @@ struct swap_map_page {
54 sector_t next_swap; 56 sector_t next_swap;
55}; 57};
56 58
59struct swap_map_page_list {
60 struct swap_map_page *map;
61 struct swap_map_page_list *next;
62};
63
57/** 64/**
58 * The swap_map_handle structure is used for handling swap in 65 * The swap_map_handle structure is used for handling swap in
59 * a file-like way 66 * a file-like way
@@ -61,13 +68,18 @@ struct swap_map_page {
61 68
62struct swap_map_handle { 69struct swap_map_handle {
63 struct swap_map_page *cur; 70 struct swap_map_page *cur;
71 struct swap_map_page_list *maps;
64 sector_t cur_swap; 72 sector_t cur_swap;
65 sector_t first_sector; 73 sector_t first_sector;
66 unsigned int k; 74 unsigned int k;
75 unsigned long nr_free_pages, written;
76 u32 crc32;
67}; 77};
68 78
69struct swsusp_header { 79struct swsusp_header {
70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 80 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
81 sizeof(u32)];
82 u32 crc32;
71 sector_t image; 83 sector_t image;
72 unsigned int flags; /* Flags to pass to the "boot" kernel */ 84 unsigned int flags; /* Flags to pass to the "boot" kernel */
73 char orig_sig[10]; 85 char orig_sig[10];
@@ -199,6 +211,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 211 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
200 swsusp_header->image = handle->first_sector; 212 swsusp_header->image = handle->first_sector;
201 swsusp_header->flags = flags; 213 swsusp_header->flags = flags;
214 if (flags & SF_CRC32_MODE)
215 swsusp_header->crc32 = handle->crc32;
202 error = hib_bio_write_page(swsusp_resume_block, 216 error = hib_bio_write_page(swsusp_resume_block,
203 swsusp_header, NULL); 217 swsusp_header, NULL);
204 } else { 218 } else {
@@ -245,6 +259,7 @@ static int swsusp_swap_check(void)
245static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 259static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
246{ 260{
247 void *src; 261 void *src;
262 int ret;
248 263
249 if (!offset) 264 if (!offset)
250 return -ENOSPC; 265 return -ENOSPC;
@@ -254,9 +269,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
254 if (src) { 269 if (src) {
255 copy_page(src, buf); 270 copy_page(src, buf);
256 } else { 271 } else {
257 WARN_ON_ONCE(1); 272 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
258 bio_chain = NULL; /* Go synchronous */ 273 if (ret)
259 src = buf; 274 return ret;
275 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
276 if (src) {
277 copy_page(src, buf);
278 } else {
279 WARN_ON_ONCE(1);
280 bio_chain = NULL; /* Go synchronous */
281 src = buf;
282 }
260 } 283 }
261 } else { 284 } else {
262 src = buf; 285 src = buf;
@@ -293,6 +316,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
293 goto err_rel; 316 goto err_rel;
294 } 317 }
295 handle->k = 0; 318 handle->k = 0;
319 handle->nr_free_pages = nr_free_pages() >> 1;
320 handle->written = 0;
296 handle->first_sector = handle->cur_swap; 321 handle->first_sector = handle->cur_swap;
297 return 0; 322 return 0;
298err_rel: 323err_rel:
@@ -316,20 +341,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
316 return error; 341 return error;
317 handle->cur->entries[handle->k++] = offset; 342 handle->cur->entries[handle->k++] = offset;
318 if (handle->k >= MAP_PAGE_ENTRIES) { 343 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
322 offset = alloc_swapdev_block(root_swap); 344 offset = alloc_swapdev_block(root_swap);
323 if (!offset) 345 if (!offset)
324 return -ENOSPC; 346 return -ENOSPC;
325 handle->cur->next_swap = offset; 347 handle->cur->next_swap = offset;
326 error = write_page(handle->cur, handle->cur_swap, NULL); 348 error = write_page(handle->cur, handle->cur_swap, bio_chain);
327 if (error) 349 if (error)
328 goto out; 350 goto out;
329 clear_page(handle->cur); 351 clear_page(handle->cur);
330 handle->cur_swap = offset; 352 handle->cur_swap = offset;
331 handle->k = 0; 353 handle->k = 0;
332 } 354 }
355 if (bio_chain && ++handle->written > handle->nr_free_pages) {
356 error = hib_wait_on_bio_chain(bio_chain);
357 if (error)
358 goto out;
359 handle->written = 0;
360 }
333 out: 361 out:
334 return error; 362 return error;
335} 363}
@@ -372,6 +400,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
372 LZO_HEADER, PAGE_SIZE) 400 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 401#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374 402
403/* Maximum number of threads for compression/decompression. */
404#define LZO_THREADS 3
405
406/* Maximum number of pages for read buffering. */
407#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
408
409
375/** 410/**
376 * save_image - save the suspend image data 411 * save_image - save the suspend image data
377 */ 412 */
@@ -419,6 +454,92 @@ static int save_image(struct swap_map_handle *handle,
419 return ret; 454 return ret;
420} 455}
421 456
457/**
458 * Structure used for CRC32.
459 */
460struct crc_data {
461 struct task_struct *thr; /* thread */
462 atomic_t ready; /* ready to start flag */
463 atomic_t stop; /* ready to stop flag */
464 unsigned run_threads; /* nr current threads */
465 wait_queue_head_t go; /* start crc update */
466 wait_queue_head_t done; /* crc update done */
467 u32 *crc32; /* points to handle's crc32 */
468 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
469 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
470};
471
472/**
473 * CRC32 update function that runs in its own thread.
474 */
475static int crc32_threadfn(void *data)
476{
477 struct crc_data *d = data;
478 unsigned i;
479
480 while (1) {
481 wait_event(d->go, atomic_read(&d->ready) ||
482 kthread_should_stop());
483 if (kthread_should_stop()) {
484 d->thr = NULL;
485 atomic_set(&d->stop, 1);
486 wake_up(&d->done);
487 break;
488 }
489 atomic_set(&d->ready, 0);
490
491 for (i = 0; i < d->run_threads; i++)
492 *d->crc32 = crc32_le(*d->crc32,
493 d->unc[i], *d->unc_len[i]);
494 atomic_set(&d->stop, 1);
495 wake_up(&d->done);
496 }
497 return 0;
498}
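crc32_threadfn() folds each worker's uncompressed buffer into one running checksum with crc32_le() from <linux/crc32.h>, newly included above. The accumulation pattern in isolation, as a sketch:

    #include <linux/crc32.h>

    /* Fold a set of buffers into a single running CRC32 (seed 0, as above). */
    static u32 image_crc32(unsigned char **bufs, size_t *lens, unsigned int n)
    {
            u32 crc = 0;
            unsigned int i;

            for (i = 0; i < n; i++)
                    crc = crc32_le(crc, bufs[i], lens[i]);
            return crc;
    }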
499/**
500 * Structure used for LZO data compression.
501 */
502struct cmp_data {
503 struct task_struct *thr; /* thread */
504 atomic_t ready; /* ready to start flag */
505 atomic_t stop; /* ready to stop flag */
506 int ret; /* return code */
507 wait_queue_head_t go; /* start compression */
508 wait_queue_head_t done; /* compression done */
509 size_t unc_len; /* uncompressed length */
510 size_t cmp_len; /* compressed length */
511 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
512 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
513 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
514};
515
516/**
517 * Compression function that runs in its own thread.
518 */
519static int lzo_compress_threadfn(void *data)
520{
521 struct cmp_data *d = data;
522
523 while (1) {
524 wait_event(d->go, atomic_read(&d->ready) ||
525 kthread_should_stop());
526 if (kthread_should_stop()) {
527 d->thr = NULL;
528 d->ret = -1;
529 atomic_set(&d->stop, 1);
530 wake_up(&d->done);
531 break;
532 }
533 atomic_set(&d->ready, 0);
534
535 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
536 d->cmp + LZO_HEADER, &d->cmp_len,
537 d->wrk);
538 atomic_set(&d->stop, 1);
539 wake_up(&d->done);
540 }
541 return 0;
542}
422 543
423/** 544/**
424 * save_image_lzo - Save the suspend image data compressed with LZO. 545 * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -437,42 +558,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
437 struct bio *bio; 558 struct bio *bio;
438 struct timeval start; 559 struct timeval start;
439 struct timeval stop; 560 struct timeval stop;
440 size_t off, unc_len, cmp_len; 561 size_t off;
441 unsigned char *unc, *cmp, *wrk, *page; 562 unsigned thr, run_threads, nr_threads;
563 unsigned char *page = NULL;
564 struct cmp_data *data = NULL;
565 struct crc_data *crc = NULL;
566
567 /*
568 * We'll limit the number of threads for compression to limit memory
569 * footprint.
570 */
571 nr_threads = num_online_cpus() - 1;
572 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
442 573
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 574 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) { 575 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 576 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM; 577 ret = -ENOMEM;
578 goto out_clean;
447 } 579 }
448 580
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS); 581 data = vmalloc(sizeof(*data) * nr_threads);
450 if (!wrk) { 582 if (!data) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); 583 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
452 free_page((unsigned long)page); 584 ret = -ENOMEM;
453 return -ENOMEM; 585 goto out_clean;
454 } 586 }
587 for (thr = 0; thr < nr_threads; thr++)
588 memset(&data[thr], 0, offsetof(struct cmp_data, go));
455 589
456 unc = vmalloc(LZO_UNC_SIZE); 590 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
457 if (!unc) { 591 if (!crc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 592 printk(KERN_ERR "PM: Failed to allocate crc\n");
459 vfree(wrk); 593 ret = -ENOMEM;
460 free_page((unsigned long)page); 594 goto out_clean;
461 return -ENOMEM; 595 }
596 memset(crc, 0, offsetof(struct crc_data, go));
597
598 /*
599 * Start the compression threads.
600 */
601 for (thr = 0; thr < nr_threads; thr++) {
602 init_waitqueue_head(&data[thr].go);
603 init_waitqueue_head(&data[thr].done);
604
605 data[thr].thr = kthread_run(lzo_compress_threadfn,
606 &data[thr],
607 "image_compress/%u", thr);
608 if (IS_ERR(data[thr].thr)) {
609 data[thr].thr = NULL;
610 printk(KERN_ERR
611 "PM: Cannot start compression threads\n");
612 ret = -ENOMEM;
613 goto out_clean;
614 }
462 } 615 }
463 616
464 cmp = vmalloc(LZO_CMP_SIZE); 617 /*
465 if (!cmp) { 618 * Adjust number of free pages after all allocations have been done.
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 619 * We don't want to run out of pages when writing.
467 vfree(unc); 620 */
468 vfree(wrk); 621 handle->nr_free_pages = nr_free_pages() >> 1;
469 free_page((unsigned long)page); 622
470 return -ENOMEM; 623 /*
624 * Start the CRC32 thread.
625 */
626 init_waitqueue_head(&crc->go);
627 init_waitqueue_head(&crc->done);
628
629 handle->crc32 = 0;
630 crc->crc32 = &handle->crc32;
631 for (thr = 0; thr < nr_threads; thr++) {
632 crc->unc[thr] = data[thr].unc;
633 crc->unc_len[thr] = &data[thr].unc_len;
634 }
635
636 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
637 if (IS_ERR(crc->thr)) {
638 crc->thr = NULL;
639 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
640 ret = -ENOMEM;
641 goto out_clean;
471 } 642 }
472 643
473 printk(KERN_INFO 644 printk(KERN_INFO
645 "PM: Using %u thread(s) for compression.\n"
474 "PM: Compressing and saving image data (%u pages) ... ", 646 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write); 647 nr_threads, nr_to_write);
476 m = nr_to_write / 100; 648 m = nr_to_write / 100;
477 if (!m) 649 if (!m)
478 m = 1; 650 m = 1;
@@ -480,55 +652,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
480 bio = NULL; 652 bio = NULL;
481 do_gettimeofday(&start); 653 do_gettimeofday(&start);
482 for (;;) { 654 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 655 for (thr = 0; thr < nr_threads; thr++) {
484 ret = snapshot_read_next(snapshot); 656 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
485 if (ret < 0) 657 ret = snapshot_read_next(snapshot);
486 goto out_finish; 658 if (ret < 0)
487 659 goto out_finish;
488 if (!ret) 660
661 if (!ret)
662 break;
663
664 memcpy(data[thr].unc + off,
665 data_of(*snapshot), PAGE_SIZE);
666
667 if (!(nr_pages % m))
668 printk(KERN_CONT "\b\b\b\b%3d%%",
669 nr_pages / m);
670 nr_pages++;
671 }
672 if (!off)
489 break; 673 break;
490 674
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); 675 data[thr].unc_len = off;
492 676
493 if (!(nr_pages % m)) 677 atomic_set(&data[thr].ready, 1);
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 678 wake_up(&data[thr].go);
495 nr_pages++;
496 } 679 }
497 680
498 if (!off) 681 if (!thr)
499 break; 682 break;
500 683
501 unc_len = off; 684 crc->run_threads = thr;
502 ret = lzo1x_1_compress(unc, unc_len, 685 atomic_set(&crc->ready, 1);
503 cmp + LZO_HEADER, &cmp_len, wrk); 686 wake_up(&crc->go);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508 687
509 if (unlikely(!cmp_len || 688 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
510 cmp_len > lzo1x_worst_compress(unc_len))) { 689 wait_event(data[thr].done,
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 690 atomic_read(&data[thr].stop));
512 ret = -1; 691 atomic_set(&data[thr].stop, 0);
513 break;
514 }
515 692
516 *(size_t *)cmp = cmp_len; 693 ret = data[thr].ret;
517 694
518 /* 695 if (ret < 0) {
519 * Given we are writing one page at a time to disk, we copy 696 printk(KERN_ERR "PM: LZO compression failed\n");
520 * that much from the buffer, although the last bit will likely 697 goto out_finish;
521 * be smaller than full page. This is OK - we saved the length 698 }
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527 699
528 ret = swap_write_page(handle, page, &bio); 700 if (unlikely(!data[thr].cmp_len ||
529 if (ret) 701 data[thr].cmp_len >
702 lzo1x_worst_compress(data[thr].unc_len))) {
703 printk(KERN_ERR
704 "PM: Invalid LZO compressed length\n");
705 ret = -1;
530 goto out_finish; 706 goto out_finish;
707 }
708
709 *(size_t *)data[thr].cmp = data[thr].cmp_len;
710
711 /*
712 * Given we are writing one page at a time to disk, we
713 * copy that much from the buffer, although the last
714 * bit will likely be smaller than full page. This is
715 * OK - we saved the length of the compressed data, so
716 * any garbage at the end will be discarded when we
717 * read it.
718 */
719 for (off = 0;
720 off < LZO_HEADER + data[thr].cmp_len;
721 off += PAGE_SIZE) {
722 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
723
724 ret = swap_write_page(handle, page, &bio);
725 if (ret)
726 goto out_finish;
727 }
531 } 728 }
729
730 wait_event(crc->done, atomic_read(&crc->stop));
731 atomic_set(&crc->stop, 0);
532 } 732 }
533 733
534out_finish: 734out_finish:
@@ -536,16 +736,25 @@ out_finish:
536 do_gettimeofday(&stop); 736 do_gettimeofday(&stop);
537 if (!ret) 737 if (!ret)
538 ret = err2; 738 ret = err2;
539 if (!ret) 739 if (!ret) {
540 printk(KERN_CONT "\b\b\b\bdone\n"); 740 printk(KERN_CONT "\b\b\b\bdone\n");
541 else 741 } else {
542 printk(KERN_CONT "\n"); 742 printk(KERN_CONT "\n");
743 }
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 744 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544 745out_clean:
545 vfree(cmp); 746 if (crc) {
546 vfree(unc); 747 if (crc->thr)
547 vfree(wrk); 748 kthread_stop(crc->thr);
548 free_page((unsigned long)page); 749 kfree(crc);
750 }
751 if (data) {
752 for (thr = 0; thr < nr_threads; thr++)
753 if (data[thr].thr)
754 kthread_stop(data[thr].thr);
755 vfree(data);
756 }
757 if (page) free_page((unsigned long)page);
549 758
550 return ret; 759 return ret;
551} 760}
@@ -625,8 +834,15 @@ out_finish:
625 834
626static void release_swap_reader(struct swap_map_handle *handle) 835static void release_swap_reader(struct swap_map_handle *handle)
627{ 836{
628 if (handle->cur) 837 struct swap_map_page_list *tmp;
629 free_page((unsigned long)handle->cur); 838
839 while (handle->maps) {
840 if (handle->maps->map)
841 free_page((unsigned long)handle->maps->map);
842 tmp = handle->maps;
843 handle->maps = handle->maps->next;
844 kfree(tmp);
845 }
630 handle->cur = NULL; 846 handle->cur = NULL;
631} 847}
632 848
@@ -634,22 +850,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
634 unsigned int *flags_p) 850 unsigned int *flags_p)
635{ 851{
636 int error; 852 int error;
853 struct swap_map_page_list *tmp, *last;
854 sector_t offset;
637 855
638 *flags_p = swsusp_header->flags; 856 *flags_p = swsusp_header->flags;
639 857
640 if (!swsusp_header->image) /* how can this happen? */ 858 if (!swsusp_header->image) /* how can this happen? */
641 return -EINVAL; 859 return -EINVAL;
642 860
643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 861 handle->cur = NULL;
644 if (!handle->cur) 862 last = handle->maps = NULL;
645 return -ENOMEM; 863 offset = swsusp_header->image;
864 while (offset) {
865 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
866 if (!tmp) {
867 release_swap_reader(handle);
868 return -ENOMEM;
869 }
870 memset(tmp, 0, sizeof(*tmp));
871 if (!handle->maps)
872 handle->maps = tmp;
873 if (last)
874 last->next = tmp;
875 last = tmp;
876
877 tmp->map = (struct swap_map_page *)
878 __get_free_page(__GFP_WAIT | __GFP_HIGH);
879 if (!tmp->map) {
880 release_swap_reader(handle);
881 return -ENOMEM;
882 }
646 883
647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); 884 error = hib_bio_read_page(offset, tmp->map, NULL);
648 if (error) { 885 if (error) {
649 release_swap_reader(handle); 886 release_swap_reader(handle);
650 return error; 887 return error;
888 }
889 offset = tmp->map->next_swap;
651 } 890 }
652 handle->k = 0; 891 handle->k = 0;
892 handle->cur = handle->maps->map;
653 return 0; 893 return 0;
654} 894}
655 895
@@ -658,6 +898,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
658{ 898{
659 sector_t offset; 899 sector_t offset;
660 int error; 900 int error;
901 struct swap_map_page_list *tmp;
661 902
662 if (!handle->cur) 903 if (!handle->cur)
663 return -EINVAL; 904 return -EINVAL;
@@ -668,13 +909,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
668 if (error) 909 if (error)
669 return error; 910 return error;
670 if (++handle->k >= MAP_PAGE_ENTRIES) { 911 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
672 handle->k = 0; 912 handle->k = 0;
673 offset = handle->cur->next_swap; 913 free_page((unsigned long)handle->maps->map);
674 if (!offset) 914 tmp = handle->maps;
915 handle->maps = handle->maps->next;
916 kfree(tmp);
917 if (!handle->maps)
675 release_swap_reader(handle); 918 release_swap_reader(handle);
676 else if (!error) 919 else
677 error = hib_bio_read_page(offset, handle->cur, NULL); 920 handle->cur = handle->maps->map;
678 } 921 }
679 return error; 922 return error;
680} 923}
@@ -697,7 +940,7 @@ static int load_image(struct swap_map_handle *handle,
697 unsigned int nr_to_read) 940 unsigned int nr_to_read)
698{ 941{
699 unsigned int m; 942 unsigned int m;
700 int error = 0; 943 int ret = 0;
701 struct timeval start; 944 struct timeval start;
702 struct timeval stop; 945 struct timeval stop;
703 struct bio *bio; 946 struct bio *bio;
@@ -713,15 +956,15 @@ static int load_image(struct swap_map_handle *handle,
713 bio = NULL; 956 bio = NULL;
714 do_gettimeofday(&start); 957 do_gettimeofday(&start);
715 for ( ; ; ) { 958 for ( ; ; ) {
716 error = snapshot_write_next(snapshot); 959 ret = snapshot_write_next(snapshot);
717 if (error <= 0) 960 if (ret <= 0)
718 break; 961 break;
719 error = swap_read_page(handle, data_of(*snapshot), &bio); 962 ret = swap_read_page(handle, data_of(*snapshot), &bio);
720 if (error) 963 if (ret)
721 break; 964 break;
722 if (snapshot->sync_read) 965 if (snapshot->sync_read)
723 error = hib_wait_on_bio_chain(&bio); 966 ret = hib_wait_on_bio_chain(&bio);
724 if (error) 967 if (ret)
725 break; 968 break;
726 if (!(nr_pages % m)) 969 if (!(nr_pages % m))
727 printk("\b\b\b\b%3d%%", nr_pages / m); 970 printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +972,61 @@ static int load_image(struct swap_map_handle *handle,
729 } 972 }
730 err2 = hib_wait_on_bio_chain(&bio); 973 err2 = hib_wait_on_bio_chain(&bio);
731 do_gettimeofday(&stop); 974 do_gettimeofday(&stop);
732 if (!error) 975 if (!ret)
733 error = err2; 976 ret = err2;
734 if (!error) { 977 if (!ret) {
735 printk("\b\b\b\bdone\n"); 978 printk("\b\b\b\bdone\n");
736 snapshot_write_finalize(snapshot); 979 snapshot_write_finalize(snapshot);
737 if (!snapshot_image_loaded(snapshot)) 980 if (!snapshot_image_loaded(snapshot))
738 error = -ENODATA; 981 ret = -ENODATA;
739 } else 982 } else
740 printk("\n"); 983 printk("\n");
741 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 984 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
742 return error; 985 return ret;
986}
987
988/**
989 * Structure used for LZO data decompression.
990 */
991struct dec_data {
992 struct task_struct *thr; /* thread */
993 atomic_t ready; /* ready to start flag */
994 atomic_t stop; /* ready to stop flag */
995 int ret; /* return code */
996 wait_queue_head_t go; /* start decompression */
997 wait_queue_head_t done; /* decompression done */
998 size_t unc_len; /* uncompressed length */
999 size_t cmp_len; /* compressed length */
1000 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1001 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1002};
1003
1004/**
1005 * Decompression function that runs in its own thread.
1006 */
1007static int lzo_decompress_threadfn(void *data)
1008{
1009 struct dec_data *d = data;
1010
1011 while (1) {
1012 wait_event(d->go, atomic_read(&d->ready) ||
1013 kthread_should_stop());
1014 if (kthread_should_stop()) {
1015 d->thr = NULL;
1016 d->ret = -1;
1017 atomic_set(&d->stop, 1);
1018 wake_up(&d->done);
1019 break;
1020 }
1021 atomic_set(&d->ready, 0);
1022
1023 d->unc_len = LZO_UNC_SIZE;
1024 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1025 d->unc, &d->unc_len);
1026 atomic_set(&d->stop, 1);
1027 wake_up(&d->done);
1028 }
1029 return 0;
743} 1030}
744 1031
745/** 1032/**
@@ -753,50 +1040,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
753 unsigned int nr_to_read) 1040 unsigned int nr_to_read)
754{ 1041{
755 unsigned int m; 1042 unsigned int m;
756 int error = 0; 1043 int ret = 0;
1044 int eof = 0;
757 struct bio *bio; 1045 struct bio *bio;
758 struct timeval start; 1046 struct timeval start;
759 struct timeval stop; 1047 struct timeval stop;
760 unsigned nr_pages; 1048 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len; 1049 size_t off;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; 1050 unsigned i, thr, run_threads, nr_threads;
763 1051 unsigned ring = 0, pg = 0, ring_size = 0,
764 for (i = 0; i < LZO_CMP_PAGES; i++) { 1052 have = 0, want, need, asked = 0;
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 1053 unsigned long read_pages;
766 if (!page[i]) { 1054 unsigned char **page = NULL;
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1055 struct dec_data *data = NULL;
1056 struct crc_data *crc = NULL;
1057
1058 /*
1059 * We'll limit the number of threads for decompression to limit memory
1060 * footprint.
1061 */
1062 nr_threads = num_online_cpus() - 1;
1063 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1064
1065 page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
1066 if (!page) {
1067 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1068 ret = -ENOMEM;
1069 goto out_clean;
1070 }
768 1071
769 while (i) 1072 data = vmalloc(sizeof(*data) * nr_threads);
770 free_page((unsigned long)page[--i]); 1073 if (!data) {
1074 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
1075 ret = -ENOMEM;
1076 goto out_clean;
1077 }
1078 for (thr = 0; thr < nr_threads; thr++)
1079 memset(&data[thr], 0, offsetof(struct dec_data, go));
771 1080
772 return -ENOMEM; 1081 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
1082 if (!crc) {
1083 printk(KERN_ERR "PM: Failed to allocate crc\n");
1084 ret = -ENOMEM;
1085 goto out_clean;
1086 }
1087 memset(crc, 0, offsetof(struct crc_data, go));
1088
1089 /*
1090 * Start the decompression threads.
1091 */
1092 for (thr = 0; thr < nr_threads; thr++) {
1093 init_waitqueue_head(&data[thr].go);
1094 init_waitqueue_head(&data[thr].done);
1095
1096 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1097 &data[thr],
1098 "image_decompress/%u", thr);
1099 if (IS_ERR(data[thr].thr)) {
1100 data[thr].thr = NULL;
1101 printk(KERN_ERR
1102 "PM: Cannot start decompression threads\n");
1103 ret = -ENOMEM;
1104 goto out_clean;
773 } 1105 }
774 } 1106 }
775 1107
776 unc = vmalloc(LZO_UNC_SIZE); 1108 /*
777 if (!unc) { 1109 * Start the CRC32 thread.
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 1110 */
779 1111 init_waitqueue_head(&crc->go);
780 for (i = 0; i < LZO_CMP_PAGES; i++) 1112 init_waitqueue_head(&crc->done);
781 free_page((unsigned long)page[i]); 1113
782 1114 handle->crc32 = 0;
783 return -ENOMEM; 1115 crc->crc32 = &handle->crc32;
1116 for (thr = 0; thr < nr_threads; thr++) {
1117 crc->unc[thr] = data[thr].unc;
1118 crc->unc_len[thr] = &data[thr].unc_len;
784 } 1119 }
785 1120
786 cmp = vmalloc(LZO_CMP_SIZE); 1121 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
787 if (!cmp) { 1122 if (IS_ERR(crc->thr)) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 1123 crc->thr = NULL;
1124 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
1125 ret = -ENOMEM;
1126 goto out_clean;
1127 }
789 1128
790 vfree(unc); 1129 /*
791 for (i = 0; i < LZO_CMP_PAGES; i++) 1130 * Adjust number of pages for read buffering, in case we are short.
792 free_page((unsigned long)page[i]); 1131 */
1132 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
1133 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
793 1134
794 return -ENOMEM; 1135 for (i = 0; i < read_pages; i++) {
1136 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1137 __GFP_WAIT | __GFP_HIGH :
1138 __GFP_WAIT);
1139 if (!page[i]) {
1140 if (i < LZO_CMP_PAGES) {
1141 ring_size = i;
1142 printk(KERN_ERR
1143 "PM: Failed to allocate LZO pages\n");
1144 ret = -ENOMEM;
1145 goto out_clean;
1146 } else {
1147 break;
1148 }
1149 }
795 } 1150 }
1151 want = ring_size = i;
796 1152
797 printk(KERN_INFO 1153 printk(KERN_INFO
1154 "PM: Using %u thread(s) for decompression.\n"
798 "PM: Loading and decompressing image data (%u pages) ... ", 1155 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read); 1156 nr_threads, nr_to_read);
800 m = nr_to_read / 100; 1157 m = nr_to_read / 100;
801 if (!m) 1158 if (!m)
802 m = 1; 1159 m = 1;
@@ -804,85 +1161,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
804 bio = NULL; 1161 bio = NULL;
805 do_gettimeofday(&start); 1162 do_gettimeofday(&start);
806 1163
807 error = snapshot_write_next(snapshot); 1164 ret = snapshot_write_next(snapshot);
808 if (error <= 0) 1165 if (ret <= 0)
809 goto out_finish; 1166 goto out_finish;
810 1167
811 for (;;) { 1168 for(;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */ 1169 for (i = 0; !eof && i < want; i++) {
813 if (error) 1170 ret = swap_read_page(handle, page[ring], &bio);
814 break; 1171 if (ret) {
815 1172 /*
816 cmp_len = *(size_t *)page[0]; 1173 * On real read error, finish. On end of data,
817 if (unlikely(!cmp_len || 1174 * set EOF flag and just exit the read loop.
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 1175 */
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 1176 if (handle->cur &&
820 error = -1; 1177 handle->cur->entries[handle->k]) {
821 break; 1178 goto out_finish;
1179 } else {
1180 eof = 1;
1181 break;
1182 }
1183 }
1184 if (++ring >= ring_size)
1185 ring = 0;
822 } 1186 }
1187 asked += i;
1188 want -= i;
823 1189
824 for (off = PAGE_SIZE, i = 1; 1190 /*
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { 1191 * We are out of data, wait for some more.
826 error = swap_read_page(handle, page[i], &bio); 1192 */
827 if (error) 1193 if (!have) {
1194 if (!asked)
1195 break;
1196
1197 ret = hib_wait_on_bio_chain(&bio);
1198 if (ret)
828 goto out_finish; 1199 goto out_finish;
1200 have += asked;
1201 asked = 0;
1202 if (eof)
1203 eof = 2;
829 } 1204 }
830 1205
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */ 1206 if (crc->run_threads) {
832 if (error) 1207 wait_event(crc->done, atomic_read(&crc->stop));
833 goto out_finish; 1208 atomic_set(&crc->stop, 0);
834 1209 crc->run_threads = 0;
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 } 1210 }
839 1211
840 unc_len = LZO_UNC_SIZE; 1212 for (thr = 0; have && thr < nr_threads; thr++) {
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, 1213 data[thr].cmp_len = *(size_t *)page[pg];
842 unc, &unc_len); 1214 if (unlikely(!data[thr].cmp_len ||
843 if (error < 0) { 1215 data[thr].cmp_len >
844 printk(KERN_ERR "PM: LZO decompression failed\n"); 1216 lzo1x_worst_compress(LZO_UNC_SIZE))) {
845 break; 1217 printk(KERN_ERR
1218 "PM: Invalid LZO compressed length\n");
1219 ret = -1;
1220 goto out_finish;
1221 }
1222
1223 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1224 PAGE_SIZE);
1225 if (need > have) {
1226 if (eof > 1) {
1227 ret = -1;
1228 goto out_finish;
1229 }
1230 break;
1231 }
1232
1233 for (off = 0;
1234 off < LZO_HEADER + data[thr].cmp_len;
1235 off += PAGE_SIZE) {
1236 memcpy(data[thr].cmp + off,
1237 page[pg], PAGE_SIZE);
1238 have--;
1239 want++;
1240 if (++pg >= ring_size)
1241 pg = 0;
1242 }
1243
1244 atomic_set(&data[thr].ready, 1);
1245 wake_up(&data[thr].go);
846 } 1246 }
847 1247
848 if (unlikely(!unc_len || 1248 /*
849 unc_len > LZO_UNC_SIZE || 1249 * Wait for more data while we are decompressing.
850 unc_len & (PAGE_SIZE - 1))) { 1250 */
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); 1251 if (have < LZO_CMP_PAGES && asked) {
852 error = -1; 1252 ret = hib_wait_on_bio_chain(&bio);
853 break; 1253 if (ret)
1254 goto out_finish;
1255 have += asked;
1256 asked = 0;
1257 if (eof)
1258 eof = 2;
854 } 1259 }
855 1260
856 for (off = 0; off < unc_len; off += PAGE_SIZE) { 1261 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); 1262 wait_event(data[thr].done,
1263 atomic_read(&data[thr].stop));
1264 atomic_set(&data[thr].stop, 0);
1265
1266 ret = data[thr].ret;
858 1267
859 if (!(nr_pages % m)) 1268 if (ret < 0) {
860 printk("\b\b\b\b%3d%%", nr_pages / m); 1269 printk(KERN_ERR
861 nr_pages++; 1270 "PM: LZO decompression failed\n");
1271 goto out_finish;
1272 }
862 1273
863 error = snapshot_write_next(snapshot); 1274 if (unlikely(!data[thr].unc_len ||
864 if (error <= 0) 1275 data[thr].unc_len > LZO_UNC_SIZE ||
1276 data[thr].unc_len & (PAGE_SIZE - 1))) {
1277 printk(KERN_ERR
1278 "PM: Invalid LZO uncompressed length\n");
1279 ret = -1;
865 goto out_finish; 1280 goto out_finish;
1281 }
1282
1283 for (off = 0;
1284 off < data[thr].unc_len; off += PAGE_SIZE) {
1285 memcpy(data_of(*snapshot),
1286 data[thr].unc + off, PAGE_SIZE);
1287
1288 if (!(nr_pages % m))
1289 printk("\b\b\b\b%3d%%", nr_pages / m);
1290 nr_pages++;
1291
1292 ret = snapshot_write_next(snapshot);
1293 if (ret <= 0) {
1294 crc->run_threads = thr + 1;
1295 atomic_set(&crc->ready, 1);
1296 wake_up(&crc->go);
1297 goto out_finish;
1298 }
1299 }
866 } 1300 }
1301
1302 crc->run_threads = thr;
1303 atomic_set(&crc->ready, 1);
1304 wake_up(&crc->go);
867 } 1305 }
868 1306
869out_finish: 1307out_finish:
1308 if (crc->run_threads) {
1309 wait_event(crc->done, atomic_read(&crc->stop));
1310 atomic_set(&crc->stop, 0);
1311 }
870 do_gettimeofday(&stop); 1312 do_gettimeofday(&stop);
871 if (!error) { 1313 if (!ret) {
872 printk("\b\b\b\bdone\n"); 1314 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot); 1315 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot)) 1316 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA; 1317 ret = -ENODATA;
1318 if (!ret) {
1319 if (swsusp_header->flags & SF_CRC32_MODE) {
1320 if(handle->crc32 != swsusp_header->crc32) {
1321 printk(KERN_ERR
1322 "PM: Invalid image CRC32!\n");
1323 ret = -ENODATA;
1324 }
1325 }
1326 }
876 } else 1327 } else
877 printk("\n"); 1328 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1329 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879 1330out_clean:
880 vfree(cmp); 1331 for (i = 0; i < ring_size; i++)
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]); 1332 free_page((unsigned long)page[i]);
1333 if (crc) {
1334 if (crc->thr)
1335 kthread_stop(crc->thr);
1336 kfree(crc);
1337 }
1338 if (data) {
1339 for (thr = 0; thr < nr_threads; thr++)
1340 if (data[thr].thr)
1341 kthread_stop(data[thr].thr);
1342 vfree(data);
1343 }
1344 if (page) vfree(page);
884 1345
885 return error; 1346 return ret;
886} 1347}
887 1348
888/** 1349/**
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 42ddbc6f0de6..6b1ab7a88522 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,6 +12,7 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
17#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
@@ -20,6 +21,7 @@
20#include <linux/swapops.h> 21#include <linux/swapops.h>
21#include <linux/pm.h> 22#include <linux/pm.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
23#include <linux/console.h> 25#include <linux/console.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
25#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -29,28 +31,6 @@
29 31
30#include "power.h" 32#include "power.h"
31 33
32/*
33 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
34 * will be removed in the future. They are only preserved here for
35 * compatibility with existing userland utilities.
36 */
37#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
38#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
39
40#define PMOPS_PREPARE 1
41#define PMOPS_ENTER 2
42#define PMOPS_FINISH 3
43
44/*
45 * NOTE: The following ioctl definitions are wrong and have been replaced with
46 * correct ones. They are only preserved here for compatibility with existing
47 * userland utilities and will be removed in the future.
48 */
49#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
50#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
51#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
52#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
53
54 34
55#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
56 36
@@ -70,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
70 struct snapshot_data *data; 50 struct snapshot_data *data;
71 int error; 51 int error;
72 52
73 mutex_lock(&pm_mutex); 53 lock_system_sleep();
74 54
75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
76 error = -EBUSY; 56 error = -EBUSY;
@@ -122,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
122 data->platform_support = 0; 102 data->platform_support = 0;
123 103
124 Unlock: 104 Unlock:
125 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
126 106
127 return error; 107 return error;
128} 108}
@@ -131,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
131{ 111{
132 struct snapshot_data *data; 112 struct snapshot_data *data;
133 113
134 mutex_lock(&pm_mutex); 114 lock_system_sleep();
135 115
136 swsusp_free(); 116 swsusp_free();
137 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -145,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
145 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
146 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
147 127
148 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
149 129
150 return 0; 130 return 0;
151} 131}
@@ -157,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
157 ssize_t res; 137 ssize_t res;
158 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
159 139
160 mutex_lock(&pm_mutex); 140 lock_system_sleep();
161 141
162 data = filp->private_data; 142 data = filp->private_data;
163 if (!data->ready) { 143 if (!data->ready) {
@@ -178,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
178 *offp += res; 158 *offp += res;
179 159
180 Unlock: 160 Unlock:
181 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
182 162
183 return res; 163 return res;
184} 164}
@@ -190,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
190 ssize_t res; 170 ssize_t res;
191 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
192 172
193 mutex_lock(&pm_mutex); 173 lock_system_sleep();
194 174
195 data = filp->private_data; 175 data = filp->private_data;
196 176
@@ -207,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
207 if (res > 0) 187 if (res > 0)
208 *offp += res; 188 *offp += res;
209unlock: 189unlock:
210 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
211 191
212 return res; 192 return res;
213} 193}
214 194
215static void snapshot_deprecated_ioctl(unsigned int cmd)
216{
217 if (printk_ratelimit())
218 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
219 "be removed soon, update your suspend-to-disk "
220 "utilities\n",
221 __builtin_return_address(0), cmd);
222}
223
224static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
225 unsigned long arg) 196 unsigned long arg)
226{ 197{
@@ -256,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
256 break; 227 break;
257 228
258 error = freeze_processes(); 229 error = freeze_processes();
259 if (error) { 230 if (error)
260 thaw_processes();
261 usermodehelper_enable(); 231 usermodehelper_enable();
262 } 232 else
263 if (!error)
264 data->frozen = 1; 233 data->frozen = 1;
265 break; 234 break;
266 235
@@ -273,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
273 data->frozen = 0; 242 data->frozen = 0;
274 break; 243 break;
275 244
276 case SNAPSHOT_ATOMIC_SNAPSHOT:
277 snapshot_deprecated_ioctl(cmd);
278 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
279 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
280 error = -EPERM; 247 error = -EPERM;
@@ -282,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
282 } 249 }
283 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
284 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
285 if (!error) 252 if (!error) {
286 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
287 if (!error) 254 if (!error && !freezer_test_done)
288 data->ready = 1; 255 data->ready = 1;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 }
289 break; 261 break;
290 262
291 case SNAPSHOT_ATOMIC_RESTORE: 263 case SNAPSHOT_ATOMIC_RESTORE:
@@ -304,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
304 data->ready = 0; 276 data->ready = 0;
305 break; 277 break;
306 278
307 case SNAPSHOT_SET_IMAGE_SIZE:
308 snapshot_deprecated_ioctl(cmd);
309 case SNAPSHOT_PREF_IMAGE_SIZE: 279 case SNAPSHOT_PREF_IMAGE_SIZE:
310 image_size = arg; 280 image_size = arg;
311 break; 281 break;
@@ -320,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
320 error = put_user(size, (loff_t __user *)arg); 290 error = put_user(size, (loff_t __user *)arg);
321 break; 291 break;
322 292
323 case SNAPSHOT_AVAIL_SWAP:
324 snapshot_deprecated_ioctl(cmd);
325 case SNAPSHOT_AVAIL_SWAP_SIZE: 293 case SNAPSHOT_AVAIL_SWAP_SIZE:
326 size = count_swap_pages(data->swap, 1); 294 size = count_swap_pages(data->swap, 1);
327 size <<= PAGE_SHIFT; 295 size <<= PAGE_SHIFT;
328 error = put_user(size, (loff_t __user *)arg); 296 error = put_user(size, (loff_t __user *)arg);
329 break; 297 break;
330 298
331 case SNAPSHOT_GET_SWAP_PAGE:
332 snapshot_deprecated_ioctl(cmd);
333 case SNAPSHOT_ALLOC_SWAP_PAGE: 299 case SNAPSHOT_ALLOC_SWAP_PAGE:
334 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 300 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
335 error = -ENODEV; 301 error = -ENODEV;
@@ -352,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
352 free_all_swap_pages(data->swap); 318 free_all_swap_pages(data->swap);
353 break; 319 break;
354 320
355 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
356 snapshot_deprecated_ioctl(cmd);
357 if (!swsusp_swap_in_use()) {
358 /*
359 * User space encodes device types as two-byte values,
360 * so we need to recode them
361 */
362 if (old_decode_dev(arg)) {
363 data->swap = swap_type_of(old_decode_dev(arg),
364 0, NULL);
365 if (data->swap < 0)
366 error = -ENODEV;
367 } else {
368 data->swap = -1;
369 error = -EINVAL;
370 }
371 } else {
372 error = -EPERM;
373 }
374 break;
375
376 case SNAPSHOT_S2RAM: 321 case SNAPSHOT_S2RAM:
377 if (!data->frozen) { 322 if (!data->frozen) {
378 error = -EPERM; 323 error = -EPERM;
@@ -395,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
395 error = hibernation_platform_enter(); 340 error = hibernation_platform_enter();
396 break; 341 break;
397 342
398 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
399 snapshot_deprecated_ioctl(cmd);
400 error = -EINVAL;
401
402 switch (arg) {
403
404 case PMOPS_PREPARE:
405 data->platform_support = 1;
406 error = 0;
407 break;
408
409 case PMOPS_ENTER:
410 if (data->platform_support)
411 error = hibernation_platform_enter();
412 break;
413
414 case PMOPS_FINISH:
415 if (data->platform_support)
416 error = 0;
417 break;
418
419 default:
420 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
421
422 }
423 break;
424
425 case SNAPSHOT_SET_SWAP_AREA: 343 case SNAPSHOT_SET_SWAP_AREA:
426 if (swsusp_swap_in_use()) { 344 if (swsusp_swap_in_use()) {
427 error = -EPERM; 345 error = -EPERM;
@@ -463,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
463 return error; 381 return error;
464} 382}
465 383
384#ifdef CONFIG_COMPAT
385
386struct compat_resume_swap_area {
387 compat_loff_t offset;
388 u32 dev;
389} __packed;
390
391static long
392snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
393{
394 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
395
396 switch (cmd) {
397 case SNAPSHOT_GET_IMAGE_SIZE:
398 case SNAPSHOT_AVAIL_SWAP_SIZE:
399 case SNAPSHOT_ALLOC_SWAP_PAGE: {
400 compat_loff_t __user *uoffset = compat_ptr(arg);
401 loff_t offset;
402 mm_segment_t old_fs;
403 int err;
404
405 old_fs = get_fs();
406 set_fs(KERNEL_DS);
407 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
408 set_fs(old_fs);
409 if (!err && put_user(offset, uoffset))
410 err = -EFAULT;
411 return err;
412 }
413
414 case SNAPSHOT_CREATE_IMAGE:
415 return snapshot_ioctl(file, cmd,
416 (unsigned long) compat_ptr(arg));
417
418 case SNAPSHOT_SET_SWAP_AREA: {
419 struct compat_resume_swap_area __user *u_swap_area =
420 compat_ptr(arg);
421 struct resume_swap_area swap_area;
422 mm_segment_t old_fs;
423 int err;
424
425 err = get_user(swap_area.offset, &u_swap_area->offset);
426 err |= get_user(swap_area.dev, &u_swap_area->dev);
427 if (err)
428 return -EFAULT;
429 old_fs = get_fs();
430 set_fs(KERNEL_DS);
431 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
432 (unsigned long) &swap_area);
433 set_fs(old_fs);
434 return err;
435 }
436
437 default:
438 return snapshot_ioctl(file, cmd, arg);
439 }
440}
441
442#endif /* CONFIG_COMPAT */
443
466static const struct file_operations snapshot_fops = { 444static const struct file_operations snapshot_fops = {
467 .open = snapshot_open, 445 .open = snapshot_open,
468 .release = snapshot_release, 446 .release = snapshot_release,
@@ -470,6 +448,9 @@ static const struct file_operations snapshot_fops = {
470 .write = snapshot_write, 448 .write = snapshot_write,
471 .llseek = no_llseek, 449 .llseek = no_llseek,
472 .unlocked_ioctl = snapshot_ioctl, 450 .unlocked_ioctl = snapshot_ioctl,
451#ifdef CONFIG_COMPAT
452 .compat_ioctl = snapshot_compat_ioctl,
453#endif
473}; 454};
474 455
475static struct miscdevice snapshot_device = { 456static struct miscdevice snapshot_device = {
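
The new compat_ioctl handler above exists because several SNAPSHOT_* commands return a loff_t through the pointer argument, which a 32-bit caller supplies as a compat_loff_t. A minimal userspace sketch of one such call follows; it is illustrative only and not part of the patch, and it assumes the usual /dev/snapshot device node (root only, normally used by hibernation tools such as s2disk) and the exported <linux/suspend_ioctls.h> header.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
        long long avail;        /* kernel writes a loff_t here */
        int fd = open("/dev/snapshot", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/snapshot");
                return 1;
        }
        if (ioctl(fd, SNAPSHOT_AVAIL_SWAP_SIZE, &avail) < 0) {
                perror("SNAPSHOT_AVAIL_SWAP_SIZE");
                close(fd);
                return 1;
        }
        printf("available swap: %lld bytes\n", avail);
        close(fd);
        return 0;
}

Built as a 32-bit binary running on a 64-bit kernel, this request is routed through snapshot_compat_ioctl(), which performs the native ioctl into a kernel-side loff_t and copies the result back to the compat_loff_t pointer with put_user().
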
diff --git a/kernel/printk.c b/kernel/printk.c
index 28a40d8171b8..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
100 * It is also used in interesting ways to provide interlocking in 100 * It is also used in interesting ways to provide interlocking in
101 * console_unlock();. 101 * console_unlock();.
102 */ 102 */
103static DEFINE_SPINLOCK(logbuf_lock); 103static DEFINE_RAW_SPINLOCK(logbuf_lock);
104 104
105#define LOG_BUF_MASK (log_buf_len-1) 105#define LOG_BUF_MASK (log_buf_len-1)
106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
212 return; 212 return;
213 } 213 }
214 214
215 spin_lock_irqsave(&logbuf_lock, flags); 215 raw_spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len; 216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf; 217 log_buf = new_log_buf;
218 new_log_buf_len = 0; 218 new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
230 log_start -= offset; 230 log_start -= offset;
231 con_start -= offset; 231 con_start -= offset;
232 log_end -= offset; 232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags); 233 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
234 234
235 pr_info("log_buf_len: %d\n", log_buf_len); 235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n", 236 pr_info("early log buf free: %d(%d%%)\n",
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
365 if (error) 365 if (error)
366 goto out; 366 goto out;
367 i = 0; 367 i = 0;
368 spin_lock_irq(&logbuf_lock); 368 raw_spin_lock_irq(&logbuf_lock);
369 while (!error && (log_start != log_end) && i < len) { 369 while (!error && (log_start != log_end) && i < len) {
370 c = LOG_BUF(log_start); 370 c = LOG_BUF(log_start);
371 log_start++; 371 log_start++;
372 spin_unlock_irq(&logbuf_lock); 372 raw_spin_unlock_irq(&logbuf_lock);
373 error = __put_user(c,buf); 373 error = __put_user(c,buf);
374 buf++; 374 buf++;
375 i++; 375 i++;
376 cond_resched(); 376 cond_resched();
377 spin_lock_irq(&logbuf_lock); 377 raw_spin_lock_irq(&logbuf_lock);
378 } 378 }
379 spin_unlock_irq(&logbuf_lock); 379 raw_spin_unlock_irq(&logbuf_lock);
380 if (!error) 380 if (!error)
381 error = i; 381 error = i;
382 break; 382 break;
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 count = len; 399 count = len;
400 if (count > log_buf_len) 400 if (count > log_buf_len)
401 count = log_buf_len; 401 count = log_buf_len;
402 spin_lock_irq(&logbuf_lock); 402 raw_spin_lock_irq(&logbuf_lock);
403 if (count > logged_chars) 403 if (count > logged_chars)
404 count = logged_chars; 404 count = logged_chars;
405 if (do_clear) 405 if (do_clear)
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
416 if (j + log_buf_len < log_end) 416 if (j + log_buf_len < log_end)
417 break; 417 break;
418 c = LOG_BUF(j); 418 c = LOG_BUF(j);
419 spin_unlock_irq(&logbuf_lock); 419 raw_spin_unlock_irq(&logbuf_lock);
420 error = __put_user(c,&buf[count-1-i]); 420 error = __put_user(c,&buf[count-1-i]);
421 cond_resched(); 421 cond_resched();
422 spin_lock_irq(&logbuf_lock); 422 raw_spin_lock_irq(&logbuf_lock);
423 } 423 }
424 spin_unlock_irq(&logbuf_lock); 424 raw_spin_unlock_irq(&logbuf_lock);
425 if (error) 425 if (error)
426 break; 426 break;
427 error = i; 427 error = i;
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
537 "print all kernel messages to the console.");
535 538
536/* 539/*
537 * Write out chars from start to end - 1 inclusive 540 * Write out chars from start to end - 1 inclusive
@@ -592,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
592 /* multi digit including the level and facility number */ 595 /* multi digit including the level and facility number */
593 char *endp = NULL; 596 char *endp = NULL;
594 597
595 if (p[1] < '0' && p[1] > '9')
596 return 0;
597
598 lev = (simple_strtoul(&p[1], &endp, 10) & 7); 598 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
599 if (endp == NULL || endp[0] != '>') 599 if (endp == NULL || endp[0] != '>')
600 return 0; 600 return 0;
@@ -688,16 +688,17 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
694 sema_init(&console_sem, 1); 695 sema_init(&console_sem, 1);
695} 696}
696 697
697#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
698static int printk_time = 1; 699static bool printk_time = 1;
699#else 700#else
700static int printk_time = 0; 701static bool printk_time = 0;
701#endif 702#endif
702module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
703 704
@@ -802,9 +803,9 @@ static int console_trylock_for_printk(unsigned int cpu)
802 } 803 }
803 } 804 }
804 printk_cpu = UINT_MAX; 805 printk_cpu = UINT_MAX;
805 spin_unlock(&logbuf_lock);
806 if (wake) 806 if (wake)
807 up(&console_sem); 807 up(&console_sem);
808 raw_spin_unlock(&logbuf_lock);
808 return retval; 809 return retval;
809} 810}
810static const char recursion_bug_msg [] = 811static const char recursion_bug_msg [] =
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
864 } 864 }
865 865
866 lockdep_off(); 866 lockdep_off();
867 spin_lock(&logbuf_lock); 867 raw_spin_lock(&logbuf_lock);
868 printk_cpu = this_cpu; 868 printk_cpu = this_cpu;
869 869
870 if (recursion_bug) { 870 if (recursion_bug) {
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1099 return -1; 1098 return -1;
1100} 1099}
1101 1100
1102int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1103EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1104 1103
1105static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
@@ -1108,6 +1107,10 @@ static int __init console_suspend_disable(char *str)
1108 return 1; 1107 return 1;
1109} 1108}
1110__setup("no_console_suspend", console_suspend_disable); 1109__setup("no_console_suspend", console_suspend_disable);
1110module_param_named(console_suspend, console_suspend_enabled,
1111 bool, S_IRUGO | S_IWUSR);
1112MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1113 " and hibernate operations");
1111 1114
1112/** 1115/**
1113 * suspend_console - suspend the console subsystem 1116 * suspend_console - suspend the console subsystem
@@ -1257,14 +1260,14 @@ void console_unlock(void)
1257 1260
1258again: 1261again:
1259 for ( ; ; ) { 1262 for ( ; ; ) {
1260 spin_lock_irqsave(&logbuf_lock, flags); 1263 raw_spin_lock_irqsave(&logbuf_lock, flags);
1261 wake_klogd |= log_start - log_end; 1264 wake_klogd |= log_start - log_end;
1262 if (con_start == log_end) 1265 if (con_start == log_end)
1263 break; /* Nothing to print */ 1266 break; /* Nothing to print */
1264 _con_start = con_start; 1267 _con_start = con_start;
1265 _log_end = log_end; 1268 _log_end = log_end;
1266 con_start = log_end; /* Flush */ 1269 con_start = log_end; /* Flush */
1267 spin_unlock(&logbuf_lock); 1270 raw_spin_unlock(&logbuf_lock);
1268 stop_critical_timings(); /* don't trace print latency */ 1271 stop_critical_timings(); /* don't trace print latency */
1269 call_console_drivers(_con_start, _log_end); 1272 call_console_drivers(_con_start, _log_end);
1270 start_critical_timings(); 1273 start_critical_timings();
@@ -1276,7 +1279,7 @@ again:
1276 if (unlikely(exclusive_console)) 1279 if (unlikely(exclusive_console))
1277 exclusive_console = NULL; 1280 exclusive_console = NULL;
1278 1281
1279 spin_unlock(&logbuf_lock); 1282 raw_spin_unlock(&logbuf_lock);
1280 1283
1281 up(&console_sem); 1284 up(&console_sem);
1282 1285
@@ -1286,10 +1289,11 @@ again:
1286 * there's a new owner and the console_unlock() from them will do the 1289 * there's a new owner and the console_unlock() from them will do the
1287 * flush, no worries. 1290 * flush, no worries.
1288 */ 1291 */
1289 spin_lock(&logbuf_lock); 1292 raw_spin_lock(&logbuf_lock);
1290 if (con_start != log_end) 1293 if (con_start != log_end)
1291 retry = 1; 1294 retry = 1;
1292 spin_unlock_irqrestore(&logbuf_lock, flags); 1295 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1296
1293 if (retry && console_trylock()) 1297 if (retry && console_trylock())
1294 goto again; 1298 goto again;
1295 1299
@@ -1522,9 +1526,9 @@ void register_console(struct console *newcon)
1522 * console_unlock(); will print out the buffered messages 1526 * console_unlock(); will print out the buffered messages
1523 * for us. 1527 * for us.
1524 */ 1528 */
1525 spin_lock_irqsave(&logbuf_lock, flags); 1529 raw_spin_lock_irqsave(&logbuf_lock, flags);
1526 con_start = log_start; 1530 con_start = log_start;
1527 spin_unlock_irqrestore(&logbuf_lock, flags); 1531 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1528 /* 1532 /*
1529 * We're about to replay the log buffer. Only do this to the 1533 * We're about to replay the log buffer. Only do this to the
1530 * just-registered console to avoid excessive message spam to 1534 * just-registered console to avoid excessive message spam to
@@ -1731,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1731 /* Theoretically, the log could move on after we do this, but 1735 /* Theoretically, the log could move on after we do this, but
1732 there's not a lot we can do about that. The new messages 1736 there's not a lot we can do about that. The new messages
1733 will overwrite the start of what we dump. */ 1737 will overwrite the start of what we dump. */
1734 spin_lock_irqsave(&logbuf_lock, flags); 1738 raw_spin_lock_irqsave(&logbuf_lock, flags);
1735 end = log_end & LOG_BUF_MASK; 1739 end = log_end & LOG_BUF_MASK;
1736 chars = logged_chars; 1740 chars = logged_chars;
1737 spin_unlock_irqrestore(&logbuf_lock, flags); 1741 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1738 1742
1739 if (chars > end) { 1743 if (chars > end) {
1740 s1 = log_buf + log_buf_len - chars + end; 1744 s1 = log_buf + log_buf_len - chars + end;
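
The printk.c changes above do two independent things: logbuf_lock becomes a raw_spinlock_t (on PREEMPT_RT an ordinary spinlock_t turns into a sleeping lock, while raw_spinlock_t keeps spinning, so it remains usable in atomic printk paths), and ignore_loglevel and console_suspend gain runtime-writable module parameters. A minimal module sketch combining both idioms follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static unsigned long example_counter;

/* Readable/writable boot or runtime parameter, same style as above. */
static bool example_enabled = true;
module_param(example_enabled, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(example_enabled, "enable the example counter");

static int __init example_init(void)
{
        unsigned long flags;

        if (example_enabled) {
                raw_spin_lock_irqsave(&example_lock, flags);
                example_counter++;      /* critical section, IRQs off */
                raw_spin_unlock_irqrestore(&example_lock, flags);
        }
        pr_info("example_counter = %lu\n", example_counter);
        return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

After loading, the parameter appears under /sys/module/<module>/parameters/ and can be flipped at runtime, just as ignore_loglevel and printk.console_suspend now can.
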
diff --git a/kernel/profile.c b/kernel/profile.c
index 961b389fe52f..76b8e77773ee 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,7 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/notifier.h> 19#include <linux/notifier.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c890ac9a7962..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
96 */ 96 */
97 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
100 child->jobctl |= JOBCTL_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101 101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task running in the stopped group, set the signal
105 * for the future reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
112
102 /* 113 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child 115 * @child in the butt. Note that @resume should be used iff @child
diff --git a/kernel/range.c b/kernel/range.c
index 37fa9b99ad58..9b8ae2d6ed68 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/module.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 000000000000..aa88baab5f78
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,92 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
41 * by call_rcu() and rcu callback execution, and are therefore not part of the
42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
43 */
44
45#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
46# define STATE_RCU_HEAD_READY 0
47# define STATE_RCU_HEAD_QUEUED 1
48
49extern struct debug_obj_descr rcuhead_debug_descr;
50
51static inline void debug_rcu_head_queue(struct rcu_head *head)
52{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY,
57 STATE_RCU_HEAD_QUEUED);
58}
59
60static inline void debug_rcu_head_unqueue(struct rcu_head *head)
61{
62 debug_object_active_state(head, &rcuhead_debug_descr,
63 STATE_RCU_HEAD_QUEUED,
64 STATE_RCU_HEAD_READY);
65 debug_object_deactivate(head, &rcuhead_debug_descr);
66}
67#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
68static inline void debug_rcu_head_queue(struct rcu_head *head)
69{
70}
71
72static inline void debug_rcu_head_unqueue(struct rcu_head *head)
73{
74}
75#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
76
77extern void kfree(const void *);
78
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
80{
81 unsigned long offset = (unsigned long)head->func;
82
83 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset);
86 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head);
89 }
90}
91
92#endif /* __LINUX_RCU_H */
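
__rcu_reclaim() above relies on the kfree_rcu() convention: instead of a real callback, head->func may hold the small offset of the rcu_head within its enclosing object, which __is_kfree_rcu_offset() detects so the object can be kfree()d directly; otherwise the function pointer is invoked as usual. A minimal module sketch showing both callback flavors follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct example_obj {
        int payload;
        struct rcu_head rcu;    /* kfree_rcu() encodes this field's offset */
};

static void example_free_cb(struct rcu_head *head)
{
        /* Ordinary callback: __rcu_reclaim() calls it through head->func. */
        kfree(container_of(head, struct example_obj, rcu));
}

static int __init example_init(void)
{
        struct example_obj *a = kmalloc(sizeof(*a), GFP_KERNEL);
        struct example_obj *b = kmalloc(sizeof(*b), GFP_KERNEL);

        if (!a || !b) {
                kfree(a);
                kfree(b);
                return -ENOMEM;
        }
        kfree_rcu(a, rcu);                      /* offset-encoded: kfree()d directly */
        call_rcu(&b->rcu, example_free_cb);     /* function-pointer callback */
        return 0;
}

static void __exit example_exit(void)
{
        rcu_barrier();  /* make sure the callbacks have run before unload */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
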
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be61..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,9 +43,14 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h>
51
52#include "rcu.h"
53
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 55static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 56struct lockdep_map rcu_lock_map =
@@ -88,17 +93,24 @@ int rcu_read_lock_bh_held(void)
88{ 93{
89 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
90 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
91 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
92} 99}
93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
94 101
95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 102#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
96 103
104struct rcu_synchronize {
105 struct rcu_head head;
106 struct completion completion;
107};
108
97/* 109/*
98 * Awaken the corresponding synchronize_rcu() instance now that a 110 * Awaken the corresponding synchronize_rcu() instance now that a
99 * grace period has elapsed. 111 * grace period has elapsed.
100 */ 112 */
101void wakeme_after_rcu(struct rcu_head *head) 113static void wakeme_after_rcu(struct rcu_head *head)
102{ 114{
103 struct rcu_synchronize *rcu; 115 struct rcu_synchronize *rcu;
104 116
@@ -106,6 +118,20 @@ void wakeme_after_rcu(struct rcu_head *head)
106 complete(&rcu->completion); 118 complete(&rcu->completion);
107} 119}
108 120
121void wait_rcu_gp(call_rcu_func_t crf)
122{
123 struct rcu_synchronize rcu;
124
125 init_rcu_head_on_stack(&rcu.head);
126 init_completion(&rcu.completion);
127 /* Will wake me after RCU finished. */
128 crf(&rcu.head, wakeme_after_rcu);
129 /* Wait for it. */
130 wait_for_completion(&rcu.completion);
131 destroy_rcu_head_on_stack(&rcu.head);
132}
133EXPORT_SYMBOL_GPL(wait_rcu_gp);
134
109#ifdef CONFIG_PROVE_RCU 135#ifdef CONFIG_PROVE_RCU
110/* 136/*
111 * wrapper function to avoid #include problems. 137 * wrapper function to avoid #include problems.
@@ -292,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
292}; 318};
293EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
294#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
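
wait_rcu_gp() above factors out the open-coded pattern of posting a wakeme_after_rcu() callback and sleeping on a completion; callers just pass the call_rcu_*() variant whose grace period they want to wait for. A minimal fragment follows; the function name is hypothetical, and it assumes wait_rcu_gp() and call_rcu_func_t are declared in <linux/rcupdate.h> as in this series.

#include <linux/kernel.h>
#include <linux/rcupdate.h>

/* Wait for one full rcu_bh grace period, built on the new helper. */
static void example_wait_for_bh_grace_period(void)
{
        might_sleep();          /* wait_rcu_gp() blocks on a completion */
        wait_rcu_gp(call_rcu_bh);
}
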
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5ab..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,13 +22,12 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h> 25#include <linux/completion.h>
27#include <linux/interrupt.h> 26#include <linux/interrupt.h>
28#include <linux/notifier.h> 27#include <linux/notifier.h>
29#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
30#include <linux/kernel.h> 29#include <linux/kernel.h>
31#include <linux/module.h> 30#include <linux/export.h>
32#include <linux/mutex.h> 31#include <linux/mutex.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
34#include <linux/types.h> 33#include <linux/types.h>
@@ -37,47 +36,154 @@
37#include <linux/cpu.h> 36#include <linux/cpu.h>
38#include <linux/prefetch.h> 37#include <linux/prefetch.h>
39 38
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 39#ifdef CONFIG_RCU_TRACE
41static struct task_struct *rcu_kthread_task; 40#include <trace/events/rcu.h>
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41#endif /* #ifdef CONFIG_RCU_TRACE */
43static unsigned long have_rcu_kthread_work; 42
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void); 47static void invoke_rcu_callbacks(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg); 49static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state. 98 */
99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
74 */ 135 */
75void rcu_exit_nohz(void) 136void rcu_idle_exit(void)
76{ 137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
79 175
80#endif /* #ifdef CONFIG_NO_HZ */ 176#endif /* #ifdef CONFIG_PROVE_RCU */
177
178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -96,16 +202,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
96} 202}
97 203
98/* 204/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 205 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
110 * are at it, given that any rcu quiescent state is also an rcu_bh 206 * are at it, given that any rcu quiescent state is also an rcu_bh
111 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 207 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +213,7 @@ void rcu_sched_qs(int cpu)
117 local_irq_save(flags); 213 local_irq_save(flags);
118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 214 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
119 rcu_qsctr_help(&rcu_bh_ctrlblk)) 215 rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 216 invoke_rcu_callbacks();
121 local_irq_restore(flags); 217 local_irq_restore(flags);
122} 218}
123 219
@@ -130,20 +226,19 @@ void rcu_bh_qs(int cpu)
130 226
131 local_irq_save(flags); 227 local_irq_save(flags);
132 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
133 invoke_rcu_kthread(); 229 invoke_rcu_callbacks();
134 local_irq_restore(flags); 230 local_irq_restore(flags);
135} 231}
136 232
137/* 233/*
138 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
139 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
140 */ 238 */
141void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
142{ 240{
143 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
144 (idle_cpu(cpu) &&
145 !in_softirq() &&
146 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
147 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
148 else if (!in_softirq()) 243 else if (!in_softirq())
149 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,18 +249,27 @@ void rcu_check_callbacks(int cpu, int user)
154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 249 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
155 * whose grace period has elapsed. 250 * whose grace period has elapsed.
156 */ 251 */
157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) 252static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
158{ 253{
254 char *rn = NULL;
159 struct rcu_head *next, *list; 255 struct rcu_head *next, *list;
160 unsigned long flags; 256 unsigned long flags;
161 RCU_TRACE(int cb_count = 0); 257 RCU_TRACE(int cb_count = 0);
162 258
163 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 260 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
165 return; 267 return;
268 }
166 269
167 /* Move the ready-to-invoke callbacks to a local list. */ 270 /* Move the ready-to-invoke callbacks to a local list. */
168 local_irq_save(flags); 271 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
169 list = rcp->rcucblist; 273 list = rcp->rcucblist;
170 rcp->rcucblist = *rcp->donetail; 274 rcp->rcucblist = *rcp->donetail;
171 *rcp->donetail = NULL; 275 *rcp->donetail = NULL;
@@ -176,49 +280,28 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
176 local_irq_restore(flags); 280 local_irq_restore(flags);
177 281
178 /* Invoke the callbacks on the local list. */ 282 /* Invoke the callbacks on the local list. */
283 RCU_TRACE(rn = rcp->name);
179 while (list) { 284 while (list) {
180 next = list->next; 285 next = list->next;
181 prefetch(next); 286 prefetch(next);
182 debug_rcu_head_unqueue(list); 287 debug_rcu_head_unqueue(list);
183 local_bh_disable(); 288 local_bh_disable();
184 __rcu_reclaim(list); 289 __rcu_reclaim(rn, list);
185 local_bh_enable(); 290 local_bh_enable();
186 list = next; 291 list = next;
187 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
188 } 293 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
190} 298}
191 299
192/* 300static void rcu_process_callbacks(struct softirq_action *unused)
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
200{ 301{
201 unsigned long work; 302 __rcu_process_callbacks(&rcu_sched_ctrlblk);
202 unsigned long morework; 303 __rcu_process_callbacks(&rcu_bh_ctrlblk);
203 unsigned long flags; 304 rcu_preempt_process_callbacks();
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
222} 305}
223 306
224/* 307/*
@@ -280,45 +363,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
280 __call_rcu(head, func, &rcu_bh_ctrlblk); 363 __call_rcu(head, func, &rcu_bh_ctrlblk);
281} 364}
282EXPORT_SYMBOL_GPL(call_rcu_bh); 365EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
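
The rcutiny.c rework above replaces the NO_HZ-only rcu_enter_nohz()/rcu_exit_nohz() pair with rcu_idle_enter()/rcu_idle_exit() plus rcu_irq_enter()/rcu_irq_exit(), all driven by one signed nesting counter: it starts at a large task-level bias (DYNTICK_TASK_NESTING), is forced to zero when the idle loop is entered, and only the value zero means the CPU is in an extended quiescent state. A small userspace model of that bookkeeping follows; it is illustrative only, and the bias value is a stand-in for the real constant.

#include <stdio.h>
#include <assert.h>

#define DYNTICK_TASK_NESTING (1LL << 40)        /* stand-in for the real bias */

static long long dynticks_nesting = DYNTICK_TASK_NESTING;

/* Mirrors rcu_is_cpu_idle(): idle only when the counter is exactly zero. */
static int cpu_is_idle_for_rcu(void)
{
        return dynticks_nesting == 0;
}

static void idle_enter(void) { dynticks_nesting = 0; }
static void idle_exit(void)  { dynticks_nesting = DYNTICK_TASK_NESTING; }
static void irq_enter_(void) { dynticks_nesting++; }
static void irq_exit_(void)  { dynticks_nesting--; assert(dynticks_nesting >= 0); }

int main(void)
{
        idle_enter();                                   /* CPU enters the idle loop */
        printf("idle:      %d\n", cpu_is_idle_for_rcu());       /* 1 */
        irq_enter_();                                   /* interrupt taken from idle */
        printf("in irq:    %d\n", cpu_is_idle_for_rcu());       /* 0: RCU readers legal */
        irq_exit_();                                    /* back to idle */
        printf("after irq: %d\n", cpu_is_idle_for_rcu());       /* 1 */
        idle_exit();                                    /* leaving the idle loop */
        printf("running:   %d\n", cpu_is_idle_for_rcu());       /* 0 */
        return 0;
}
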
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195f..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,32 +23,30 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
26#include <linux/debugfs.h> 27#include <linux/debugfs.h>
27#include <linux/seq_file.h> 28#include <linux/seq_file.h>
28 29
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */ 30/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk { 31struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
41}; 37};
42 38
43/* Definition for rcupdate control block. */ 39/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = { 40static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist, 41 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist, 42 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
47}; 44};
48 45
49static struct rcu_ctrlblk rcu_bh_ctrlblk = { 46static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist, 47 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist, 48 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
52}; 50};
53 51
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
134}; 133};
135 134
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
@@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
247 246
248#include "rtmutex_common.h" 247#include "rtmutex_common.h"
249 248
249#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
250
251/* Controls for rcu_kthread() kthread. */
252static struct task_struct *rcu_kthread_task;
253static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
254static unsigned long have_rcu_kthread_work;
255
250/* 256/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 257 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 258 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -306,8 +312,8 @@ static int rcu_boost(void)
306 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
308 314
309 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
310 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
311} 317}
312 318
313/* 319/*
@@ -334,7 +340,7 @@ static int rcu_initiate_boost(void)
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 340 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks = 341 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks; 342 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread(); 343 invoke_rcu_callbacks();
338 } else 344 } else
339 RCU_TRACE(rcu_initiate_boost_trace()); 345 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1; 346 return 1;
@@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void)
353#else /* #ifdef CONFIG_RCU_BOOST */ 359#else /* #ifdef CONFIG_RCU_BOOST */
354 360
355/* 361/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting, 362 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the 363 * but we do indicate whether there are blocked readers blocking the
366 * current grace period. 364 * current grace period.
@@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void)
427 425
428 /* If there are done callbacks, cause them to be invoked. */ 426 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 427 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread(); 428 invoke_rcu_callbacks();
431} 429}
432 430
433/* 431/*
@@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void)
648 rcu_preempt_cpu_qs(); 646 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 647 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail) 648 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread(); 649 invoke_rcu_callbacks();
652 if (rcu_preempt_gp_in_progress() && 650 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() && 651 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader()) 652 rcu_preempt_running_reader())
@@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
674 */ 672 */
675static void rcu_preempt_process_callbacks(void) 673static void rcu_preempt_process_callbacks(void)
676{ 674{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 675 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678} 676}
679 677
680/* 678/*
@@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
697} 695}
698EXPORT_SYMBOL_GPL(call_rcu); 696EXPORT_SYMBOL_GPL(call_rcu);
699 697
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/* 698/*
715 * synchronize_rcu - wait until a grace period has elapsed. 699 * synchronize_rcu - wait until a grace period has elapsed.
716 * 700 *
@@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
864#endif /* #ifdef CONFIG_RCU_TRACE */ 848#endif /* #ifdef CONFIG_RCU_TRACE */
865 849
866/* 850/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks 851 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check. 852 * to check.
878 */ 853 */
@@ -898,6 +873,103 @@ static void rcu_preempt_process_callbacks(void)
898 873
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 874#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900 875
876#ifdef CONFIG_RCU_BOOST
877
878/*
879 * Wake up rcu_kthread() to process callbacks now eligible for invocation
880 * or to boost readers.
881 */
882static void invoke_rcu_callbacks(void)
883{
884 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq);
886}
887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
901/*
902 * This kthread invokes RCU callbacks whose grace periods have
903 * elapsed. It is awakened as needed, and takes the place of the
904 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
905 * This is a kthread, but it is never stopped, at least not until
906 * the system goes down.
907 */
908static int rcu_kthread(void *arg)
909{
910 unsigned long work;
911 unsigned long morework;
912 unsigned long flags;
913
914 for (;;) {
915 wait_event_interruptible(rcu_kthread_wq,
916 have_rcu_kthread_work != 0);
917 morework = rcu_boost();
918 local_irq_save(flags);
919 work = have_rcu_kthread_work;
920 have_rcu_kthread_work = morework;
921 local_irq_restore(flags);
922 if (work)
923 rcu_process_callbacks(NULL);
924 schedule_timeout_interruptible(1); /* Leave CPU for others. */
925 }
926
927 return 0; /* Not reached, but needed to shut gcc up. */
928}
929
930/*
931 * Spawn the kthread that invokes RCU callbacks.
932 */
933static int __init rcu_spawn_kthreads(void)
934{
935 struct sched_param sp;
936
937 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
938 sp.sched_priority = RCU_BOOST_PRIO;
939 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
940 return 0;
941}
942early_initcall(rcu_spawn_kthreads);
943
944#else /* #ifdef CONFIG_RCU_BOOST */
945
946/*
947 * Start up softirq processing of callbacks.
948 */
949void invoke_rcu_callbacks(void)
950{
951 raise_softirq(RCU_SOFTIRQ);
952}
953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
966void rcu_init(void)
967{
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
969}
970
971#endif /* #else #ifdef CONFIG_RCU_BOOST */
972
901#ifdef CONFIG_DEBUG_LOCK_ALLOC 973#ifdef CONFIG_DEBUG_LOCK_ALLOC
902#include <linux/kernel_stat.h> 974#include <linux/kernel_stat.h>
903 975
@@ -913,12 +985,6 @@ void __init rcu_scheduler_starting(void)
913 985
914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 986#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915 987
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE 988#ifdef CONFIG_RCU_TRACE
923 989
924#ifdef CONFIG_RCU_BOOST 990#ifdef CONFIG_RCU_BOOST
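
With the reorganisation above, invoke_rcu_callbacks() either raises RCU_SOFTIRQ (the !CONFIG_RCU_BOOST case) or sets a flag and wakes rcu_kthread() (the CONFIG_RCU_BOOST case). A minimal module sketch of that flag-and-wake kthread pattern follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

static struct task_struct *example_task;
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static unsigned long example_have_work;

static void example_kick(void)          /* analogue of invoke_rcu_callbacks() */
{
        example_have_work = 1;
        wake_up(&example_wq);
}

static int example_thread(void *unused)
{
        while (!kthread_should_stop()) {
                wait_event_interruptible(example_wq,
                                         example_have_work ||
                                         kthread_should_stop());
                example_have_work = 0;
                pr_info("example_thread: processing work\n");
        }
        return 0;
}

static int __init example_init(void)
{
        example_task = kthread_run(example_thread, NULL, "example_thread");
        if (IS_ERR(example_task))
                return PTR_ERR(example_task);
        example_kick();
        return 0;
}

static void __exit example_exit(void)
{
        kthread_stop(example_task);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
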
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7e..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -73,7 +75,7 @@ module_param(nreaders, int, 0444);
73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 75MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74module_param(nfakewriters, int, 0444); 76module_param(nfakewriters, int, 0444);
75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 77MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76module_param(stat_interval, int, 0444); 78module_param(stat_interval, int, 0644);
77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 79MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78module_param(verbose, bool, 0444); 80module_param(verbose, bool, 0444);
79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 81MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -480,30 +500,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 500 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481} 501}
482 502
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
507static struct rcu_torture_ops rcu_bh_ops = { 503static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 504 .init = NULL,
509 .cleanup = NULL, 505 .cleanup = NULL,
@@ -512,7 +508,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
512 .readunlock = rcu_bh_torture_read_unlock, 508 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 509 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 510 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 511 .sync = synchronize_rcu_bh,
516 .cb_barrier = rcu_barrier_bh, 512 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 513 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 514 .stats = NULL,
@@ -528,7 +524,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
528 .readunlock = rcu_bh_torture_read_unlock, 524 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 525 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 526 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 527 .sync = synchronize_rcu_bh,
532 .cb_barrier = NULL, 528 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 529 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 530 .stats = NULL,
@@ -536,6 +532,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
536 .name = "rcu_bh_sync" 532 .name = "rcu_bh_sync"
537}; 533};
538 534
535static struct rcu_torture_ops rcu_bh_expedited_ops = {
536 .init = rcu_sync_torture_init,
537 .cleanup = NULL,
538 .readlock = rcu_bh_torture_read_lock,
539 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
540 .readunlock = rcu_bh_torture_read_unlock,
541 .completed = rcu_bh_torture_completed,
542 .deferred_free = rcu_sync_torture_deferred_free,
543 .sync = synchronize_rcu_bh_expedited,
544 .cb_barrier = NULL,
545 .fqs = rcu_bh_force_quiescent_state,
546 .stats = NULL,
547 .irq_capable = 1,
548 .name = "rcu_bh_expedited"
549};
550
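The hunks above simplify the rcu_bh flavor's test table: the hand-rolled rcu_bh_torture_synchronize() wrapper (a callback plus a completion) is deleted, the .sync slots point straight at synchronize_rcu_bh(), and a new rcu_bh_expedited_ops table exercises synchronize_rcu_bh_expedited() through the same layout. A minimal userspace sketch of the ops-table idea, with purely illustrative names, looks like this:

#include <stdio.h>

struct torture_ops {
        void (*sync)(void);             /* wait for a grace period */
        const char *name;
};

static void demo_synchronize(void)
{
        printf("grace period elapsed\n");
}

static struct torture_ops demo_ops = {
        .sync = demo_synchronize,       /* plugged in directly, no wrapper needed */
        .name = "demo",
};

int main(void)
{
        printf("torture-testing flavor %s\n", demo_ops.name);
        demo_ops.sync();
        return 0;
}

Because every flavor is reached only through such a table, adding rcu_bh_expedited_ops to the torture_ops[] array in rcu_torture_init() is all that is needed to make it selectable via the torture_type module parameter.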
539/* 551/*
540 * Definitions for srcu torture testing. 552 * Definitions for srcu torture testing.
541 */ 553 */
@@ -620,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
620 .name = "srcu" 632 .name = "srcu"
621}; 633};
622 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
623static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
624{ 660{
625 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -659,11 +695,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 695 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660} 696}
661 697
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
667static struct rcu_torture_ops sched_ops = { 698static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 700 .cleanup = NULL,
@@ -672,7 +703,7 @@ static struct rcu_torture_ops sched_ops = {
672 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 704 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 705 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 706 .sync = synchronize_sched,
676 .cb_barrier = rcu_barrier_sched, 707 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 708 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 709 .stats = NULL,
@@ -688,7 +719,7 @@ static struct rcu_torture_ops sched_sync_ops = {
688 .readunlock = sched_torture_read_unlock, 719 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 720 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 721 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 722 .sync = synchronize_sched,
692 .cb_barrier = NULL, 723 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 724 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 725 .stats = NULL,
@@ -754,7 +785,7 @@ static int rcu_torture_boost(void *arg)
754 do { 785 do {
755 /* Wait for the next test interval. */ 786 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 787 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 788 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
758 schedule_timeout_uninterruptible(1); 789 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 790 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 791 if (kthread_should_stop() ||
@@ -765,7 +796,7 @@ static int rcu_torture_boost(void *arg)
765 /* Do one boost-test interval. */ 796 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 797 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 798 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 799 while (ULONG_CMP_LT(jiffies, endtime)) {
769 /* If we don't have a callback in flight, post one. */ 800 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 801 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 802 smp_mb(); /* RCU core before ->inflight = 1. */
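Both boost-test loops above replace the open-coded wraparound test (jiffies - t > ULONG_MAX / 2) with the ULONG_CMP_LT() helper, which expresses the same modular comparison by name, and the loops now also bail out when kthread_should_stop() is set so that module unload is not delayed by a full test interval. The point of either form is that a plain "jiffies < deadline" breaks once the jiffies counter wraps; a small standalone demonstration (the macro body below mirrors the kernel's definition and is reproduced only for illustration):

#include <stdio.h>
#include <limits.h>

/* Wrap-safe "a is before b", as used for jiffies deadlines. */
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long now = ULONG_MAX - 5;      /* counter just before wrapping */
        unsigned long deadline = now + 10;      /* deadline lands after the wrap */

        /* The naive comparison thinks the deadline has already passed... */
        printf("naive now < deadline: %d\n", now < deadline);          /* 0 */
        /* ...while the modular comparison still sees it in the future. */
        printf("ULONG_CMP_LT(now, deadline): %d\n",
               ULONG_CMP_LT(now, deadline));                           /* 1 */
        return 0;
}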
@@ -792,7 +823,8 @@ static int rcu_torture_boost(void *arg)
792 * interval. Besides, we are running at RT priority, 823 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 824 * so delays should be relatively rare.
794 */ 825 */
795 while (oldstarttime == boost_starttime) { 826 while (oldstarttime == boost_starttime &&
827 !kthread_should_stop()) {
796 if (mutex_trylock(&boost_mutex)) { 828 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 829 boost_starttime = jiffies +
798 test_boost_interval * HZ; 830 test_boost_interval * HZ;
@@ -809,11 +841,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
809 841
810 /* Clean up and exit. */ 842 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 843 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 844 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 845 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 846 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 847 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
848 destroy_rcu_head_on_stack(&rbi.rcu);
817 return 0; 849 return 0;
818} 850}
819 851
@@ -831,11 +863,13 @@ rcu_torture_fqs(void *arg)
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 863 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 864 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 865 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 866 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
867 !kthread_should_stop()) {
835 schedule_timeout_interruptible(1); 868 schedule_timeout_interruptible(1);
836 } 869 }
837 fqs_burst_remaining = fqs_duration; 870 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 871 while (fqs_burst_remaining > 0 &&
872 !kthread_should_stop()) {
839 cur_ops->fqs(); 873 cur_ops->fqs();
840 udelay(fqs_holdoff); 874 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 875 fqs_burst_remaining -= fqs_holdoff;
@@ -923,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
923 return 0; 957 return 0;
924} 958}
925 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
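rcutorture_trace_dump() uses a classic one-shot guard: a cheap atomic_read() early exit plus an atomic_xchg() tie-breaker, so that when several readers detect a too-long pipeline at the same time only the first of them triggers the expensive ftrace_dump(). The same pattern in standalone C11 (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;                     /* zero until the first dump */

static void dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                         /* already dumped, cheap exit */
        if (atomic_exchange(&beenhere, 1) != 0)
                return;                         /* lost the race to another caller */
        printf("dumping trace buffer exactly once\n");
}

int main(void)
{
        dump_once();
        dump_once();                            /* silently does nothing */
        return 0;
}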
926/* 972/*
927 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
928 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -944,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
944 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
945 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
946 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
947 if (p == NULL) { 994 if (p == NULL) {
948 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
949 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -961,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
961 /* Should not happen, but... */ 1008 /* Should not happen, but... */
962 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
963 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
964 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
965 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
966 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1004,6 +1053,7 @@ rcu_torture_reader(void *arg)
1004 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
1005 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
1006 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1007 if (p == NULL) { 1057 if (p == NULL) {
1008 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
1009 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1019,6 +1069,8 @@ rcu_torture_reader(void *arg)
1019 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1020 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1021 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1022 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1023 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1024 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1066,7 +1118,8 @@ rcu_torture_printk(char *page)
1066 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1068 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1069 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1070 rcu_torture_current, 1123 rcu_torture_current,
1071 rcu_torture_current_version, 1124 rcu_torture_current_version,
1072 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1078,7 +1131,11 @@ rcu_torture_printk(char *page)
1078 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1079 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1080 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1081 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1082 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1083 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1084 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1242,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1242 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1244 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1245 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1246 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1249 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1250 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1251} 1310}
1252 1311
1253static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1280,8 +1339,9 @@ static int rcutorture_booster_init(int cpu)
1280 /* Don't allow time recalculation while creating a new task. */ 1339 /* Don't allow time recalculation while creating a new task. */
1281 mutex_lock(&boost_mutex); 1340 mutex_lock(&boost_mutex);
1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1341 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1342 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1284 "rcu_torture_boost"); 1343 cpu_to_node(cpu),
1344 "rcu_torture_boost");
1285 if (IS_ERR(boost_tasks[cpu])) { 1345 if (IS_ERR(boost_tasks[cpu])) {
1286 retval = PTR_ERR(boost_tasks[cpu]); 1346 retval = PTR_ERR(boost_tasks[cpu]);
1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1347 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1296,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1296 return 0; 1356 return 0;
1297} 1357}
1298 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
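The new rcu_torture_shutdown() kthread lets an automated run terminate itself: it sleeps in chunks until shutdown_time, re-reading jiffies on every pass so that kthread_stop() during module unload is honored promptly, then clears shutdown_task before calling rcu_torture_cleanup() (so cleanup does not try to stop the very thread that is running it) and finally powers the machine off. A trivial userspace sketch of the deadline-sleep loop only (names are illustrative):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        time_t deadline = time(NULL) + 5;       /* stands in for shutdown_time */

        while (time(NULL) < deadline) {
                printf("%ld seconds remaining\n", (long)(deadline - time(NULL)));
                sleep(1);                       /* schedule_timeout_interruptible() */
        }
        printf("deadline reached: a real test run would power off here\n");
        return 0;
}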
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
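The CONFIG_HOTPLUG_CPU block above adds a second new kthread that, every onoff_interval seconds, picks a CPU at random and tries to offline it if it is online (or online it otherwise), bumping the attempt and success counters that the statistics hunk prints as "onoff: %ld/%ld:%ld/%ld". The same kind of hotplug stress can be approximated from user space through sysfs, which is handy for reproducing what the torture thread does (sketch only; needs root and a hotpluggable CPU, which CPU 0 frequently is not):

#include <stdio.h>

static int set_cpu_online(int cpu, int online)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/online", cpu);
        f = fopen(path, "w");
        if (!f)
                return -1;                      /* not hotpluggable or not root */
        fprintf(f, "%d\n", online);
        return fclose(f) ? -1 : 0;
}

int main(void)
{
        int cpu = 1;                            /* arbitrary victim CPU */

        if (set_cpu_online(cpu, 0) == 0)
                printf("offlined cpu%d\n", cpu);
        if (set_cpu_online(cpu, 1) == 0)
                printf("onlined cpu%d\n", cpu);
        return 0;
}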
1299static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1300 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1301{ 1486{
@@ -1400,6 +1585,11 @@ rcu_torture_cleanup(void)
1400 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1401 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1402 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1403 1593
1404 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1405 1595
@@ -1424,8 +1614,8 @@ rcu_torture_init(void)
1424 int firsterr = 0; 1614 int firsterr = 0;
1425 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1427 &rcu_bh_ops, &rcu_bh_sync_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1428 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1430 1620
1431 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1616,6 +1806,18 @@ rcu_torture_init(void)
1616 } 1806 }
1617 } 1807 }
1618 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1619 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1620 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1621 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
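Both new kthreads follow the usual create/teardown discipline visible in the init and cleanup hunks: kthread_run() reports failure as an error pointer, so the stored task pointer is nulled out before unwinding, and cleanup only ever calls kthread_stop() on a thread that was really created. The error-pointer convention itself is easy to demonstrate outside the kernel (the macro bodies below mirror the kernel's and are reproduced only for illustration):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static int dummy_task;                          /* stands in for a task_struct */

static void *create_worker(int fail)
{
        return fail ? ERR_PTR(-ENOMEM) : (void *)&dummy_task;
}

int main(void)
{
        void *task = create_worker(1);

        if (IS_ERR(task)) {
                printf("creation failed: %ld\n", PTR_ERR(task));
                task = NULL;                    /* never kthread_stop() a bogus pointer */
        }
        return 0;
}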
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd3..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -52,13 +52,16 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53 53
54#include "rcutree.h" 54#include "rcutree.h"
55#include <trace/events/rcu.h>
56
57#include "rcu.h"
55 58
56/* Data structures. */ 59/* Data structures. */
57 60
58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
59 62
60#define RCU_STATE_INITIALIZER(structname) { \ 63#define RCU_STATE_INITIALIZER(structname) { \
61 .level = { &structname.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
62 .levelcnt = { \ 65 .levelcnt = { \
63 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
64 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
@@ -66,20 +69,20 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
67 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
68 }, \ 71 }, \
69 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
70 .gpnum = -300, \ 73 .gpnum = -300, \
71 .completed = -300, \ 74 .completed = -300, \
72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
74 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
76 .name = #structname, \ 79 .name = #structname, \
77} 80}
78 81
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 82struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81 84
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 87
85static struct rcu_state *rcu_state; 88static struct rcu_state *rcu_state;
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void); 131static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130 133
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/* 134/*
134 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
@@ -156,44 +157,50 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
156 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
157 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
158 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption.
159 */ 161 */
160void rcu_sched_qs(int cpu) 162void rcu_sched_qs(int cpu)
161{ 163{
162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
163 165
164 rdp->passed_quiesc_completed = rdp->gpnum - 1; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
165 barrier(); 167 barrier();
166 rdp->passed_quiesc = 1; 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1;
167} 171}
168 172
169void rcu_bh_qs(int cpu) 173void rcu_bh_qs(int cpu)
170{ 174{
171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
172 176
173 rdp->passed_quiesc_completed = rdp->gpnum - 1; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
174 barrier(); 178 barrier();
175 rdp->passed_quiesc = 1; 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1;
176} 182}
177 183
178/* 184/*
179 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
180 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption.
181 */ 188 */
182void rcu_note_context_switch(int cpu) 189void rcu_note_context_switch(int cpu)
183{ 190{
191 trace_rcu_utilization("Start context switch");
184 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
185 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch");
186} 195}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
188 197
189#ifdef CONFIG_NO_HZ
190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
191 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
192 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
193}; 201};
194#endif /* #ifdef CONFIG_NO_HZ */
195 202
196static int blimit = 10; /* Maximum callbacks per softirq. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
197static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
198static int qlowmark = 100; /* Once only this many pending, use blimit. */ 205static int qlowmark = 100; /* Once only this many pending, use blimit. */
199 206
@@ -314,15 +321,16 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
314 * trust its state not to change because interrupts are disabled. 321 * trust its state not to change because interrupts are disabled.
315 */ 322 */
316 if (cpu_is_offline(rdp->cpu)) { 323 if (cpu_is_offline(rdp->cpu)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
317 rdp->offline_fqs++; 325 rdp->offline_fqs++;
318 return 1; 326 return 1;
319 } 327 }
320 328
321 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
322 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
323 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
324 332 * where idle loops fail to inform RCU about the CPU being idle.
325 /* The CPU is online, so send it a reschedule IPI. */ 333 */
326 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
327 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
328 else 336 else
@@ -333,64 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
333 341
334#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
335 343
336#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
337 370
338/** 371/**
339 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
340 * 373 *
341 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
342 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
343 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
344 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
345 */ 382 */
346void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
347{ 384{
348 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
349 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
350 388
351 local_irq_save(flags); 389 local_irq_save(flags);
352 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
353 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
354 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
355 return; 393 rcu_idle_enter_common(rdtp, oldval);
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
362 local_irq_restore(flags); 394 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
370} 395}
371 396
372/* 397/**
373 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
403 *
404 * This code assumes that the idle loop never does anything that might
405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
374 * 408 *
375 * Exit nohz mode, in other words, -enter- the mode in which RCU 409 * Use things like work queues to work around this limitation.
376 * read-side critical sections normally occur. 410 *
411 * You have been warned.
377 */ 412 */
378void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
379{ 414{
380 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
381 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
382 418
383 local_irq_save(flags); 419 local_irq_save(flags);
384 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
385 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
386 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
387 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
388 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
394 local_irq_restore(flags); 519 local_irq_restore(flags);
395} 520}
396 521
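The rewritten entry/exit paths above replace the old rcu_enter_nohz()/rcu_exit_nohz() pair with explicit rcu_idle_enter()/rcu_idle_exit() and rcu_irq_enter()/rcu_irq_exit() handlers driven by one ->dynticks_nesting count: idle entry crowbars it to zero (defending against unbalanced usermode upcalls), interrupts increment and decrement it, and the heavyweight accounting in rcu_idle_enter_common()/rcu_idle_exit_common() runs only on transitions to or from zero. Stripped of the atomics, warnings, and tracing, the control flow amounts to the sketch below (DYNTICK_TASK_NESTING's value here is a placeholder, not the kernel's):

#include <stdio.h>

#define DYNTICK_TASK_NESTING    1       /* placeholder value for illustration */

static long long nesting = DYNTICK_TASK_NESTING;

static void enter_idle_accounting(void) { printf("  RCU stops watching this CPU\n"); }
static void exit_idle_accounting(void)  { printf("  RCU resumes watching this CPU\n"); }

static void idle_enter(void)            /* rcu_idle_enter() */
{
        nesting = 0;                    /* crowbar, regardless of prior nesting */
        enter_idle_accounting();
}

static void idle_exit(void)             /* rcu_idle_exit() */
{
        nesting = DYNTICK_TASK_NESTING;
        exit_idle_accounting();
}

static void irq_enter_(void)            /* rcu_irq_enter() */
{
        if (nesting++ == 0)             /* first irq taken from idle */
                exit_idle_accounting();
}

static void irq_exit_(void)             /* rcu_irq_exit() */
{
        if (--nesting == 0)             /* last irq returning to idle */
                enter_idle_accounting();
}

int main(void)
{
        printf("idle loop entered:\n");   idle_enter();
        printf("interrupt from idle:\n"); irq_enter_();
        printf("interrupt returns:\n");   irq_exit_();
        printf("task scheduled:\n");      idle_exit();
        return 0;
}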
@@ -437,27 +562,37 @@ void rcu_nmi_exit(void)
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
438} 563}
439 564
565#ifdef CONFIG_PROVE_RCU
566
440/** 567/**
441 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
442 * 569 *
443 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
444 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * or NMI handler, return true.
445 */ 572 */
446void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
447{ 574{
448 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
449} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
450 585
451/** 586/**
452 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
453 * 588 *
454 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
455 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
456 * with no ticks. 591 * disabled preemption.
457 */ 592 */
458void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
459{ 594{
460 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
461} 596}
462 597
463#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -470,7 +605,7 @@ void rcu_irq_exit(void)
470static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
471{ 606{
472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
473 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
474} 609}
475 610
476/* 611/*
@@ -481,11 +616,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
481 */ 616 */
482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 617static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
483{ 618{
484 unsigned long curr; 619 unsigned int curr;
485 unsigned long snap; 620 unsigned int snap;
486 621
487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); 622 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
488 snap = (unsigned long)rdp->dynticks_snap; 623 snap = (unsigned int)rdp->dynticks_snap;
489 624
490 /* 625 /*
491 * If the CPU passed through or entered a dynticks idle phase with 626 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +630,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
495 * read-side critical section that started before the beginning 630 * read-side critical section that started before the beginning
496 * of the current RCU grace period. 631 * of the current RCU grace period.
497 */ 632 */
498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { 633 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
634 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
499 rdp->dynticks_fqs++; 635 rdp->dynticks_fqs++;
500 return 1; 636 return 1;
501 } 637 }
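dyntick_save_progress_counter() now reports whether the snapshot itself was even (CPU already idle), and rcu_implicit_dynticks_qs() compares unsigned int snapshots: the per-CPU ->dynticks counter is bumped on every idle transition, so an even value means "idle now" and an advance of at least two means "was idle at some point since the snapshot", either of which is a quiescent state that force_quiescent_state() can report on the CPU's behalf. In miniature (illustrative names; a single thread, so no memory-ordering concerns):

#include <stdio.h>
#include <stdbool.h>

static unsigned int dynticks = 1;               /* odd: CPU currently busy */

static void idle_enter(void) { dynticks++; }    /* becomes even */
static void idle_exit(void)  { dynticks++; }    /* becomes odd again */

/* Did the CPU pass through (or stay in) idle since the snapshot? */
static bool quiescent_since(unsigned int snap, unsigned int curr)
{
        return (curr & 0x1) == 0 || curr - snap >= 2;
}

int main(void)
{
        unsigned int snap = dynticks;           /* dyntick_save_progress_counter() */

        idle_enter();
        idle_exit();                            /* a brief idle sojourn */

        printf("quiescent state observed: %d\n",
               quiescent_since(snap, dynticks));        /* prints 1 */
        return 0;
}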
@@ -506,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
506 642
507#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
508 644
509#else /* #ifdef CONFIG_NO_HZ */
510
511#ifdef CONFIG_SMP
512
513static int dyntick_save_progress_counter(struct rcu_data *rdp)
514{
515 return 0;
516}
517
518static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
519{
520 return rcu_implicit_offline_qs(rdp);
521}
522
523#endif /* #ifdef CONFIG_SMP */
524
525#endif /* #else #ifdef CONFIG_NO_HZ */
526
527int rcu_cpu_stall_suppress __read_mostly;
528
529static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
530{ 646{
531 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -537,6 +653,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
537 int cpu; 653 int cpu;
538 long delta; 654 long delta;
539 unsigned long flags; 655 unsigned long flags;
656 int ndetected;
540 struct rcu_node *rnp = rcu_get_root(rsp); 657 struct rcu_node *rnp = rcu_get_root(rsp);
541 658
542 /* Only let one CPU complain about others per time interval. */ 659 /* Only let one CPU complain about others per time interval. */
@@ -553,7 +670,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
553 * Now rat on any tasks that got kicked up to the root rcu_node 670 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining. 671 * due to CPU offlining.
555 */ 672 */
556 rcu_print_task_stall(rnp); 673 ndetected = rcu_print_task_stall(rnp);
557 raw_spin_unlock_irqrestore(&rnp->lock, flags); 674 raw_spin_unlock_irqrestore(&rnp->lock, flags);
558 675
559 /* 676 /*
@@ -565,17 +682,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
565 rsp->name); 682 rsp->name);
566 rcu_for_each_leaf_node(rsp, rnp) { 683 rcu_for_each_leaf_node(rsp, rnp) {
567 raw_spin_lock_irqsave(&rnp->lock, flags); 684 raw_spin_lock_irqsave(&rnp->lock, flags);
568 rcu_print_task_stall(rnp); 685 ndetected += rcu_print_task_stall(rnp);
569 raw_spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0) 687 if (rnp->qsmask == 0)
571 continue; 688 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu)) 690 if (rnp->qsmask & (1UL << cpu)) {
574 printk(" %d", rnp->grplo + cpu); 691 printk(" %d", rnp->grplo + cpu);
692 ndetected++;
693 }
575 } 694 }
576 printk("} (detected by %d, t=%ld jiffies)\n", 695 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 696 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace(); 697 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
699 else if (!trigger_all_cpu_backtrace())
700 dump_stack();
579 701
580 /* If so configured, complain about tasks blocking the grace period. */ 702 /* If so configured, complain about tasks blocking the grace period. */
581 703
@@ -596,7 +718,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
596 */ 718 */
597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
599 trigger_all_cpu_backtrace(); 721 if (!trigger_all_cpu_backtrace())
722 dump_stack();
600 723
601 raw_spin_lock_irqsave(&rnp->lock, flags); 724 raw_spin_lock_irqsave(&rnp->lock, flags);
602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +801,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
678 * go looking for one. 801 * go looking for one.
679 */ 802 */
680 rdp->gpnum = rnp->gpnum; 803 rdp->gpnum = rnp->gpnum;
804 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
681 if (rnp->qsmask & rdp->grpmask) { 805 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1; 806 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0; 807 rdp->passed_quiesce = 0;
684 } else 808 } else
685 rdp->qs_pending = 0; 809 rdp->qs_pending = 0;
686 } 810 }
@@ -741,6 +865,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
741 865
742 /* Remember that we saw this grace-period completion. */ 866 /* Remember that we saw this grace-period completion. */
743 rdp->completed = rnp->completed; 867 rdp->completed = rnp->completed;
868 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
744 869
745 /* 870 /*
746 * If we were in an extended quiescent state, we may have 871 * If we were in an extended quiescent state, we may have
@@ -826,33 +951,33 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 951 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
827 struct rcu_node *rnp = rcu_get_root(rsp); 952 struct rcu_node *rnp = rcu_get_root(rsp);
828 953
829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 954 if (!rcu_scheduler_fully_active ||
830 if (cpu_needs_another_gp(rsp, rdp)) 955 !cpu_needs_another_gp(rsp, rdp)) {
831 rsp->fqs_need_gp = 1; 956 /*
832 if (rnp->completed == rsp->completed) { 957 * Either the scheduler hasn't yet spawned the first
833 raw_spin_unlock_irqrestore(&rnp->lock, flags); 958 * non-idle task or this CPU does not need another
834 return; 959 * grace period. Either way, don't start a new grace
835 } 960 * period.
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 961 */
962 raw_spin_unlock_irqrestore(&rnp->lock, flags);
963 return;
964 }
837 965
966 if (rsp->fqs_active) {
838 /* 967 /*
839 * Propagate new ->completed value to rcu_node structures 968 * This CPU needs a grace period, but force_quiescent_state()
840 * so that other CPUs don't have to wait until the start 969 * is running. Tell it to start one on this CPU's behalf.
841 * of the next grace period to process their callbacks.
842 */ 970 */
843 rcu_for_each_node_breadth_first(rsp, rnp) { 971 rsp->fqs_need_gp = 1;
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 972 raw_spin_unlock_irqrestore(&rnp->lock, flags);
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return; 973 return;
850 } 974 }
851 975
852 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
853 rsp->gpnum++; 977 rsp->gpnum++;
854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
857 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
858 983
@@ -862,9 +987,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
862 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
863 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
864 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags); 996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return; 997 return;
870 } 998 }
@@ -901,12 +1029,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
901 if (rnp == rdp->mynode) 1029 if (rnp == rdp->mynode)
902 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1030 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp); 1031 rcu_preempt_boost_start_gp(rnp);
1032 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1033 rnp->level, rnp->grplo,
1034 rnp->grphi, rnp->qsmask);
904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1035 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
905 } 1036 }
906 1037
907 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
908 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
909 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
910 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
911 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
912} 1043}
@@ -922,6 +1053,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
922 __releases(rcu_get_root(rsp)->lock) 1053 __releases(rcu_get_root(rsp)->lock)
923{ 1054{
924 unsigned long gp_duration; 1055 unsigned long gp_duration;
1056 struct rcu_node *rnp = rcu_get_root(rsp);
1057 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
925 1058
926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1059 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927 1060
@@ -933,8 +1066,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
933 gp_duration = jiffies - rsp->gp_start; 1066 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max) 1067 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration; 1068 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum; 1069
937 rsp->signaled = RCU_GP_IDLE; 1070 /*
1071 * We know the grace period is complete, but to everyone else
1072 * it appears to still be ongoing. But it is also the case
1073 * that to everyone else it looks like there is nothing that
1074 * they can do to advance the grace period. It is therefore
1075 * safe for us to drop the lock in order to mark the grace
1076 * period as completed in all of the rcu_node structures.
1077 *
1078 * But if this CPU needs another grace period, it will take
1079 * care of this while initializing the next grace period.
1080 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1081 * because the callbacks have not yet been advanced: Those
1082 * callbacks are waiting on the grace period that just now
1083 * completed.
1084 */
1085 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
1086 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1087
1088 /*
1089 * Propagate new ->completed value to rcu_node structures
1090 * so that other CPUs don't have to wait until the start
1091 * of the next grace period to process their callbacks.
1092 */
1093 rcu_for_each_node_breadth_first(rsp, rnp) {
1094 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1095 rnp->completed = rsp->gpnum;
1096 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1097 }
1098 rnp = rcu_get_root(rsp);
1099 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1100 }
1101
1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1104 rsp->fqs_state = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
939} 1106}
940 1107
@@ -962,6 +1129,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
962 return; 1129 return;
963 } 1130 }
964 rnp->qsmask &= ~mask; 1131 rnp->qsmask &= ~mask;
1132 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1133 mask, rnp->qsmask, rnp->level,
1134 rnp->grplo, rnp->grphi,
1135 !!rnp->gp_tasks);
965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1136 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
966 1137
967 /* Other bits still set at this level, so done. */ 1138 /* Other bits still set at this level, so done. */
@@ -1000,7 +1171,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1000 * based on quiescent states detected in an earlier grace period! 1171 * based on quiescent states detected in an earlier grace period!
1001 */ 1172 */
1002static void 1173static void
1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 1174rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1004{ 1175{
1005 unsigned long flags; 1176 unsigned long flags;
1006 unsigned long mask; 1177 unsigned long mask;
@@ -1008,17 +1179,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1008 1179
1009 rnp = rdp->mynode; 1180 rnp = rdp->mynode;
1010 raw_spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1011 if (lastcomp != rnp->completed) { 1182 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1012 1183
1013 /* 1184 /*
1014 * Someone beat us to it for this grace period, so leave. 1185 * The grace period in which this quiescent state was
1015 * The race with GP start is resolved by the fact that we 1186 * recorded has ended, so don't report it upwards.
1016 * hold the leaf rcu_node lock, so that the per-CPU bits 1187 * We will instead need a new quiescent state that lies
1017 * cannot yet be initialized -- so we would simply find our 1188 * within the current grace period.
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1020 */ 1189 */
1021 rdp->passed_quiesc = 0; /* try again later! */ 1190 rdp->passed_quiesce = 0; /* need qs for new gp. */
1022 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1191 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1023 return; 1192 return;
1024 } 1193 }
@@ -1062,14 +1231,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1062 * Was there a quiescent state since the beginning of the grace 1231 * Was there a quiescent state since the beginning of the grace
1063 * period? If no, then exit and wait for the next call. 1232 * period? If no, then exit and wait for the next call.
1064 */ 1233 */
1065 if (!rdp->passed_quiesc) 1234 if (!rdp->passed_quiesce)
1066 return; 1235 return;
1067 1236
1068 /* 1237 /*
1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1238 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1070 * judge of that). 1239 * judge of that).
1071 */ 1240 */
1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 1241 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1073} 1242}
1074 1243
1075#ifdef CONFIG_HOTPLUG_CPU 1244#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1299,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1130 if (rnp->qsmaskinit != 0) { 1299 if (rnp->qsmaskinit != 0) {
1131 if (rnp != rdp->mynode) 1300 if (rnp != rdp->mynode)
1132 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1133 break; 1307 break;
1134 } 1308 }
1135 if (rnp == rdp->mynode) 1309 if (rnp == rdp->mynode) {
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1136 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1137 else 1315 } else
1138 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1139 mask = rnp->grpmask; 1317 mask = rnp->grpmask;
1140 rnp = rnp->parent; 1318 rnp = rnp->parent;
@@ -1153,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1153 else 1331 else
1154 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1155 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1156 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1157 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1158} 1336}
1159 1337
@@ -1190,17 +1368,24 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1190{ 1368{
1191 unsigned long flags; 1369 unsigned long flags;
1192 struct rcu_head *next, *list, **tail; 1370 struct rcu_head *next, *list, **tail;
1193 int count; 1371 int bl, count;
1194 1372
1195 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1196 if (!cpu_has_callbacks_ready_to_invoke(rdp)) 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1197 return; 1379 return;
1380 }
1198 1381
1199 /* 1382 /*
1200 * Extract the list of ready callbacks, disabling to prevent 1383 * Extract the list of ready callbacks, disabling to prevent
1201 * races with call_rcu() from interrupt handlers. 1384 * races with call_rcu() from interrupt handlers.
1202 */ 1385 */
1203 local_irq_save(flags); 1386 local_irq_save(flags);
1387 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1204 list = rdp->nxtlist; 1389 list = rdp->nxtlist;
1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1401,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1216 next = list->next; 1401 next = list->next;
1217 prefetch(next); 1402 prefetch(next);
1218 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1219 __rcu_reclaim(list); 1404 __rcu_reclaim(rsp->name, list);
1220 list = next; 1405 list = next;
1221 if (++count >= rdp->blimit) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1222 break; 1410 break;
1223 } 1411 }
1224 1412
1225 local_irq_save(flags); 1413 local_irq_save(flags);
1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1226 1417
1227 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count; 1419 rdp->qlen -= count;
@@ -1250,7 +1441,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1250 1441
1251 local_irq_restore(flags); 1442 local_irq_restore(flags);
1252 1443
1253 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1444 /* Re-invoke RCU core processing if there are callbacks remaining. */
1254 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1445 if (cpu_has_callbacks_ready_to_invoke(rdp))
1255 invoke_rcu_core(); 1446 invoke_rcu_core();
1256} 1447}
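
Editorial note: the reworked loop in rcu_do_batch() above only honors the batch limit when the CPU actually has competing work (need_resched(), or it is neither idle nor the callbacks kthread). A compressed userspace sketch of that batching policy follows; the list type, reclaim(), and cpu_has_other_work() are invented stand-ins, not kernel APIs.

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdbool.h>

	struct cb {                        /* stand-in for struct rcu_head */
		struct cb *next;
	};

	static void reclaim(struct cb *c)  /* stand-in for __rcu_reclaim() */
	{
		free(c);
	}

	static bool cpu_has_other_work(void)   /* models need_resched() etc. */
	{
		return false;                  /* pretend the CPU is otherwise idle */
	}

	/* Invoke ready callbacks; honor the batch limit only under contention. */
	static int do_batch(struct cb **list, int bl)
	{
		int count = 0;
		struct cb *c = *list;

		while (c != NULL) {
			struct cb *next = c->next;

			reclaim(c);
			c = next;
			/* Stop only if limit reached and CPU has something to do. */
			if (++count >= bl && cpu_has_other_work())
				break;
		}
		*list = c;                     /* requeue whatever remains */
		return count;
	}

	int main(void)
	{
		struct cb *list = NULL;
		int i;

		for (i = 0; i < 25; i++) {
			struct cb *c = malloc(sizeof(*c));

			c->next = list;
			list = c;
		}
		printf("invoked %d callbacks\n", do_batch(&list, 10));
		return 0;
	}

With cpu_has_other_work() returning false, all 25 callbacks are invoked in one pass even though the limit is 10, which is the behavior the hunk above introduces for idle CPUs.
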
@@ -1258,17 +1449,16 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1258/* 1449/*
1259 * Check to see if this CPU is in a non-context-switch quiescent state 1450 * Check to see if this CPU is in a non-context-switch quiescent state
1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1261 * Also schedule the RCU softirq handler. 1452 * Also schedule RCU core processing.
1262 * 1453 *
1263 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1265 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1266 */ 1457 */
1267void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1268{ 1459{
1269 if (user || 1460 trace_rcu_utilization("Start scheduler-tick");
1270 (idle_cpu(cpu) && rcu_scheduler_active && 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1272 1462
1273 /* 1463 /*
1274 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1299,6 +1489,7 @@ void rcu_check_callbacks(int cpu, int user)
1299 rcu_preempt_check_callbacks(cpu); 1489 rcu_preempt_check_callbacks(cpu);
1300 if (rcu_pending(cpu)) 1490 if (rcu_pending(cpu))
1301 invoke_rcu_core(); 1491 invoke_rcu_core();
1492 trace_rcu_utilization("End scheduler-tick");
1302} 1493}
1303 1494
1304#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
@@ -1360,10 +1551,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1360 unsigned long flags; 1551 unsigned long flags;
1361 struct rcu_node *rnp = rcu_get_root(rsp); 1552 struct rcu_node *rnp = rcu_get_root(rsp);
1362 1553
1363 if (!rcu_gp_in_progress(rsp)) 1554 trace_rcu_utilization("Start fqs");
1555 if (!rcu_gp_in_progress(rsp)) {
1556 trace_rcu_utilization("End fqs");
1364 return; /* No grace period in progress, nothing to force. */ 1557 return; /* No grace period in progress, nothing to force. */
1558 }
1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1559 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1560 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1561 trace_rcu_utilization("End fqs");
1367 return; /* Someone else is already on the job. */ 1562 return; /* Someone else is already on the job. */
1368 } 1563 }
1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1564 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1377,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1377 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1378 } 1573 }
1379 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1380 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1381 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1382 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1383 1578
@@ -1393,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1393 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1394 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1395 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1396 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1397 break; 1592 break;
1398 1593
1399 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1412,11 +1607,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1607 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0; 1608 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1609 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1610 trace_rcu_utilization("End fqs");
1415 return; 1611 return;
1416 } 1612 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1613 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret: 1614unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1615 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1616 trace_rcu_utilization("End fqs");
1420} 1617}
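
Editorial note: the renamed ->fqs_state field still steps through the same two-phase protocol: RCU_SAVE_DYNTICK snapshots each CPU's dyntick counter, and the following RCU_FORCE_QS pass rechecks those snapshots to credit quiescent states to CPUs that are, or have since been, idle. A rough single-threaded model of that snapshot-then-recheck idea (array size and counter values are invented, and all locking is omitted):

	#include <stdio.h>

	enum fqs_state { GP_IDLE, SAVE_DYNTICK, FORCE_QS };

	#define NCPUS 4

	static int dynticks[NCPUS];   /* even = idle, odd = non-idle (invented values) */
	static int snap[NCPUS];

	static enum fqs_state fqs_step(enum fqs_state state)
	{
		int i;

		switch (state) {
		case SAVE_DYNTICK:
			for (i = 0; i < NCPUS; i++)
				snap[i] = dynticks[i];     /* phase 1: record counters */
			return FORCE_QS;
		case FORCE_QS:
			for (i = 0; i < NCPUS; i++)
				/* Idle now, or has transitioned since the snapshot? */
				if ((dynticks[i] & 1) == 0 || dynticks[i] != snap[i])
					printf("cpu %d: quiescent state observed\n", i);
			return FORCE_QS;
		default:
			return state;
		}
	}

	int main(void)
	{
		enum fqs_state state = SAVE_DYNTICK;

		dynticks[0] = 2;   /* idle */
		dynticks[1] = 3;   /* busy */
		dynticks[2] = 5;   /* busy */
		dynticks[3] = 8;   /* idle */

		state = fqs_step(state);   /* snapshot pass */
		dynticks[1] = 4;           /* CPU 1 enters idle afterwards */
		fqs_step(state);           /* recheck pass */
		return 0;
	}
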
1421 1618
1422#else /* #ifdef CONFIG_SMP */ 1619#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1626,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1429#endif /* #else #ifdef CONFIG_SMP */ 1626#endif /* #else #ifdef CONFIG_SMP */
1430 1627
1431/* 1628/*
1432 * This does the RCU processing work from softirq context for the 1629 * This does the RCU core processing work for the specified rcu_state
1433 * specified rcu_state and rcu_data structures. This may be called 1630 * and rcu_data structures. This may be called only from the CPU to
1434 * only from the CPU to whom the rdp belongs. 1631 * whom the rdp belongs.
1435 */ 1632 */
1436static void 1633static void
1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1634__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1665,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1468} 1665}
1469 1666
1470/* 1667/*
1471 * Do softirq processing for the current CPU. 1668 * Do RCU core processing for the current CPU.
1472 */ 1669 */
1473static void rcu_process_callbacks(struct softirq_action *unused) 1670static void rcu_process_callbacks(struct softirq_action *unused)
1474{ 1671{
1672 trace_rcu_utilization("Start RCU core");
1475 __rcu_process_callbacks(&rcu_sched_state, 1673 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data)); 1674 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1675 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks(); 1676 rcu_preempt_process_callbacks();
1479 1677 trace_rcu_utilization("End RCU core");
1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1481 rcu_needs_cpu_flush();
1482} 1678}
1483 1679
1484/* 1680/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq() 1681 * Schedule RCU callback invocation. If the specified type of RCU
1486 * in earlier versions of RCU. Note that because we are running on 1682 * does not support RCU priority boosting, just do a direct call,
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task 1683 * otherwise wake up the per-CPU kernel kthread. Note that because we
1488 * cannot disappear out from under us. 1684 * are running on the current CPU with interrupts disabled, the
1685 * rcu_cpu_kthread_task cannot disappear out from under us.
1489 */ 1686 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1687static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{ 1688{
@@ -1530,6 +1727,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++; 1728 rdp->qlen++;
1532 1729
1730 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen);
1733 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen);
1735
1533 /* If interrupts were disabled, don't dive into RCU core. */ 1736 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) { 1737 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags); 1738 local_irq_restore(flags);
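
Editorial note: the new trace call above distinguishes kfree_rcu() requests by the existing encoding trick: kfree_rcu() passes the offset of the rcu_head within its enclosing structure where a callback pointer would normally go, and __is_kfree_rcu_offset() simply checks that the "pointer" is a small integer (roughly, smaller than a page), since no real code address is that low. A standalone illustration of that encoding on a typical flat-address platform, with invented type names:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>

	struct head {                       /* stand-in for struct rcu_head */
		struct head *next;
	};

	struct widget {                     /* invented example structure */
		int payload[8];
		struct head rh;
	};

	typedef void (*cb_t)(struct head *);

	/* Real callback addresses are never this small, so low values mean "offset". */
	static int is_kfree_offset(unsigned long v)
	{
		return v < 4096;
	}

	static void run_callback(struct head *rh, cb_t func)
	{
		unsigned long v = (unsigned long)func;

		if (is_kfree_offset(v))
			free((char *)rh - v);   /* back up to the start of the object */
		else
			func(rh);               /* ordinary callback */
	}

	int main(void)
	{
		struct widget *w = malloc(sizeof(*w));

		/* What kfree_rcu() effectively queues: the head plus an encoded offset. */
		run_callback(&w->rh, (cb_t)(unsigned long)offsetof(struct widget, rh));
		return 0;
	}
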
@@ -1613,18 +1816,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1613 */ 1816 */
1614void synchronize_sched(void) 1817void synchronize_sched(void)
1615{ 1818{
1616 struct rcu_synchronize rcu;
1617
1618 if (rcu_blocking_is_gp()) 1819 if (rcu_blocking_is_gp())
1619 return; 1820 return;
1620 1821 wait_rcu_gp(call_rcu_sched);
1621 init_rcu_head_on_stack(&rcu.head);
1622 init_completion(&rcu.completion);
1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
1628} 1822}
1629EXPORT_SYMBOL_GPL(synchronize_sched); 1823EXPORT_SYMBOL_GPL(synchronize_sched);
1630 1824
@@ -1639,18 +1833,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1639 */ 1833 */
1640void synchronize_rcu_bh(void) 1834void synchronize_rcu_bh(void)
1641{ 1835{
1642 struct rcu_synchronize rcu;
1643
1644 if (rcu_blocking_is_gp()) 1836 if (rcu_blocking_is_gp())
1645 return; 1837 return;
1646 1838 wait_rcu_gp(call_rcu_bh);
1647 init_rcu_head_on_stack(&rcu.head);
1648 init_completion(&rcu.completion);
1649 /* Will wake me after RCU finished. */
1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1651 /* Wait for it. */
1652 wait_for_completion(&rcu.completion);
1653 destroy_rcu_head_on_stack(&rcu.head);
1654} 1839}
1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
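
Editorial note: both wrappers above collapse the same init_completion()/call_rcu()/wait_for_completion() sequence into a single wait_rcu_gp(crf) call that takes the flavor's call_rcu function as a parameter. A userspace analog of that factoring, using pthreads and invented names in place of the kernel's completion API, might look like this:

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Rough analog of struct rcu_synchronize: a "head" plus a completion. */
	struct waiter {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		int done;
	};

	typedef void (*cb_t)(struct waiter *);
	typedef void (*queue_fn)(cb_t, struct waiter *);  /* analog of call_rcu_*() */

	static void wakeme(struct waiter *w)              /* analog of wakeme_after_rcu() */
	{
		pthread_mutex_lock(&w->lock);
		w->done = 1;
		pthread_cond_signal(&w->cond);
		pthread_mutex_unlock(&w->lock);
	}

	/* One shared helper replaces the per-flavor queue-then-wait boilerplate. */
	static void wait_gp(queue_fn crf)
	{
		struct waiter w = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
		};

		crf(wakeme, &w);                          /* queue the wakeup callback */
		pthread_mutex_lock(&w.lock);
		while (!w.done)
			pthread_cond_wait(&w.cond, &w.lock);
		pthread_mutex_unlock(&w.lock);
	}

	/* Invented stand-in for one flavor's call_rcu(): run the callback "later". */
	struct job {
		cb_t cb;
		struct waiter *w;
	};
	static struct job the_job;

	static void *deferred(void *arg)
	{
		struct job *j = arg;

		usleep(1000);                             /* pretend a grace period elapsed */
		j->cb(j->w);
		return NULL;
	}

	static void fake_call_rcu(cb_t cb, struct waiter *w)
	{
		pthread_t t;

		the_job.cb = cb;
		the_job.w = w;
		pthread_create(&t, NULL, deferred, &the_job);
		pthread_detach(t);
	}

	int main(void)
	{
		wait_gp(fake_call_rcu);
		printf("grace period complete\n");
		return 0;
	}
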
1656 1841
@@ -1671,7 +1856,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1671 check_cpu_stall(rsp, rdp); 1856 check_cpu_stall(rsp, rdp);
1672 1857
1673 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1858 /* Is the RCU core waiting for a quiescent state from this CPU? */
1674 if (rdp->qs_pending && !rdp->passed_quiesc) { 1859 if (rcu_scheduler_fully_active &&
1860 rdp->qs_pending && !rdp->passed_quiesce) {
1675 1861
1676 /* 1862 /*
1677 * If force_quiescent_state() coming soon and this CPU 1863 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1869,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1869 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies)) 1870 jiffies))
1685 set_need_resched(); 1871 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) { 1872 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1687 rdp->n_rp_report_qs++; 1873 rdp->n_rp_report_qs++;
1688 return 1; 1874 return 1;
1689 } 1875 }
@@ -1741,7 +1927,7 @@ static int rcu_pending(int cpu)
1741 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1742 * 1 if so. 1928 * 1 if so.
1743 */ 1929 */
1744static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1745{ 1931{
1746 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1747 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1842,10 +2028,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1842 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1843 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1844 rdp->qlen = 0; 2030 rdp->qlen = 0;
1845#ifdef CONFIG_NO_HZ
1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1847#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1848 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
2035 rdp->rsp = rsp;
1849 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1850} 2037}
1851 2038
@@ -1865,13 +2052,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1865 2052
1866 /* Set up local state, ensuring consistent view of global state. */ 2053 /* Set up local state, ensuring consistent view of global state. */
1867 raw_spin_lock_irqsave(&rnp->lock, flags); 2054 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1870 rdp->beenonline = 1; /* We have now been online. */ 2055 rdp->beenonline = 1; /* We have now been online. */
1871 rdp->preemptible = preemptible; 2056 rdp->preemptible = preemptible;
1872 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1873 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1874 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1875 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1876 2065
1877 /* 2066 /*
@@ -1891,9 +2080,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1891 rnp->qsmaskinit |= mask; 2080 rnp->qsmaskinit |= mask;
1892 mask = rnp->grpmask; 2081 mask = rnp->grpmask;
1893 if (rnp == rdp->mynode) { 2082 if (rnp == rdp->mynode) {
1894 rdp->gpnum = rnp->completed; /* if GP in progress... */ 2083 /*
2084 * If there is a grace period in progress, we will
2085 * set up to wait for it next time we run the
2086 * RCU core code.
2087 */
2088 rdp->gpnum = rnp->completed;
1895 rdp->completed = rnp->completed; 2089 rdp->completed = rnp->completed;
1896 rdp->passed_quiesc_completed = rnp->completed - 1; 2090 rdp->passed_quiesce = 0;
2091 rdp->qs_pending = 0;
2092 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2093 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
1897 } 2094 }
1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2095 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1899 rnp = rnp->parent; 2096 rnp = rnp->parent;
@@ -1919,6 +2116,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2116 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode; 2117 struct rcu_node *rnp = rdp->mynode;
1921 2118
2119 trace_rcu_utilization("Start CPU hotplug");
1922 switch (action) { 2120 switch (action) {
1923 case CPU_UP_PREPARE: 2121 case CPU_UP_PREPARE:
1924 case CPU_UP_PREPARE_FROZEN: 2122 case CPU_UP_PREPARE_FROZEN:
@@ -1944,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1944 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
1945 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
1946 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
1947 break; 2146 break;
1948 case CPU_DEAD: 2147 case CPU_DEAD:
1949 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
@@ -1954,6 +2153,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1954 default: 2153 default:
1955 break; 2154 break;
1956 } 2155 }
2156 trace_rcu_utilization("End CPU hotplug");
1957 return NOTIFY_OK; 2157 return NOTIFY_OK;
1958} 2158}
1959 2159
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26fb..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
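
Editorial note: the widened dynticks_nesting field sits alongside the existing parity convention of the atomic dynticks counter, which is incremented on every idle transition: an even value means the CPU is idle, and any change in value means it has passed through idle at least once. A tiny C11 sketch of that parity protocol, ignoring the nesting counters and the memory ordering the real code depends on (helper names invented):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int dynticks = 1;          /* odd: this "CPU" starts out non-idle */

	static void idle_enter(void)
	{
		atomic_fetch_add(&dynticks, 1);  /* counter becomes even: idle */
	}

	static void idle_exit(void)
	{
		atomic_fetch_add(&dynticks, 1);  /* counter becomes odd: non-idle */
	}

	int main(void)
	{
		int snap = atomic_load(&dynticks);

		idle_enter();
		idle_exit();
		printf("idle at snapshot time: %d\n", (snap & 1) == 0);
		printf("passed through idle since snapshot: %d\n",
		       atomic_load(&dynticks) != snap);
		return 0;
	}
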
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -230,9 +231,9 @@ struct rcu_data {
230 /* in order to detect GP end. */ 231 /* in order to detect GP end. */
231 unsigned long gpnum; /* Highest gp number that this CPU */ 232 unsigned long gpnum; /* Highest gp number that this CPU */
232 /* is aware of having started. */ 233 /* is aware of having started. */
233 unsigned long passed_quiesc_completed; 234 unsigned long passed_quiesce_gpnum;
234 /* Value of completed at time of qs. */ 235 /* gpnum at time of quiescent state. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */ 236 bool passed_quiesce; /* User-mode/idle loop etc. */
236 bool qs_pending; /* Core waits for quiesc state. */ 237 bool qs_pending; /* Core waits for quiesc state. */
237 bool beenonline; /* CPU online at least once. */ 238 bool beenonline; /* CPU online at least once. */
238 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -299,18 +296,15 @@ struct rcu_data {
299 unsigned long n_rp_need_nothing; 296 unsigned long n_rp_need_nothing;
300 297
301 int cpu; 298 int cpu;
299 struct rcu_state *rsp;
302}; 300};
303 301
304/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
305#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
306#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
307#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
308#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
309#ifdef CONFIG_NO_HZ
310#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
311#else /* #ifdef CONFIG_NO_HZ */
312#define RCU_SIGNAL_INIT RCU_FORCE_QS
313#endif /* #else #ifdef CONFIG_NO_HZ */
314 308
315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
316 310
@@ -360,7 +354,7 @@ struct rcu_state {
360 354
361 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
362 356
363 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
364 /* Force QS state. */ 358 /* Force QS state. */
365 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
366 /* is running. */ 360 /* is running. */
@@ -417,6 +411,13 @@ extern struct rcu_state rcu_preempt_state;
417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 411DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 412#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
419 413
414#ifdef CONFIG_RCU_BOOST
415DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
416DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
417DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
418DECLARE_PER_CPU(char, rcu_cpu_has_work);
419#endif /* #ifdef CONFIG_RCU_BOOST */
420
420#ifndef RCU_TREE_NONCORE 421#ifndef RCU_TREE_NONCORE
421 422
422/* Forward declarations for rcutree_plugin.h */ 423/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +431,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430static void rcu_stop_cpu_kthread(int cpu); 431static void rcu_stop_cpu_kthread(int cpu);
431#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 432#endif /* #ifdef CONFIG_HOTPLUG_CPU */
432static void rcu_print_detail_task_stall(struct rcu_state *rsp); 433static void rcu_print_detail_task_stall(struct rcu_state *rsp);
433static void rcu_print_task_stall(struct rcu_node *rnp); 434static int rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void); 435static void rcu_preempt_stall_reset(void);
435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 436static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
436#ifdef CONFIG_HOTPLUG_CPU 437#ifdef CONFIG_HOTPLUG_CPU
@@ -443,17 +444,18 @@ static void rcu_preempt_check_callbacks(int cpu);
443static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
444void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
445#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
446static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
447#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
448static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
449static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_send_cbs_to_online(void);
452static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -466,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
469 474
470#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b0..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h> 28#include <linux/stop_machine.h>
29 29
30#define RCU_KTHREAD_PRIO 1
31
32#ifdef CONFIG_RCU_BOOST
33#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34#else
35#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36#endif
37
30/* 38/*
31 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
32 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
64 72
65#ifdef CONFIG_TREE_PREEMPT_RCU 73#ifdef CONFIG_TREE_PREEMPT_RCU
66 74
67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 75struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 77static struct rcu_state *rcu_state = &rcu_preempt_state;
70 78
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
122{ 130{
123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
124 132
125 rdp->passed_quiesc_completed = rdp->gpnum - 1; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
126 barrier(); 134 barrier();
127 rdp->passed_quiesc = 1; 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1;
128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
129} 139}
130 140
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
190 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
192 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid,
205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum
207 : rnp->gpnum + 1);
193 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
@@ -297,8 +312,12 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
297{ 312{
298 int empty; 313 int empty;
299 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
300 unsigned long flags; 316 unsigned long flags;
301 struct list_head *np; 317 struct list_head *np;
318#ifdef CONFIG_RCU_BOOST
319 struct rt_mutex *rbmp = NULL;
320#endif /* #ifdef CONFIG_RCU_BOOST */
302 struct rcu_node *rnp; 321 struct rcu_node *rnp;
303 int special; 322 int special;
304 323
@@ -344,6 +363,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 363 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp); 364 np = rcu_next_node_entry(t, rnp);
346 list_del_init(&t->rcu_node_entry); 365 list_del_init(&t->rcu_node_entry);
366 t->rcu_blocked_node = NULL;
367 trace_rcu_unlock_preempted_task("rcu_preempt",
368 rnp->gpnum, t->pid);
347 if (&t->rcu_node_entry == rnp->gp_tasks) 369 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np; 370 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks) 371 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,38 +373,44 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
351#ifdef CONFIG_RCU_BOOST 373#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks) 374 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np; 375 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ 376 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
355 if (t->rcu_boosted) { 377 if (t->rcu_boost_mutex) {
356 special |= RCU_READ_UNLOCK_BOOSTED; 378 rbmp = t->rcu_boost_mutex;
357 t->rcu_boosted = 0; 379 t->rcu_boost_mutex = NULL;
358 } 380 }
359#endif /* #ifdef CONFIG_RCU_BOOST */ 381#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
361 382
362 /* 383 /*
363 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
364 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
366 */ 388 */
367 if (empty) 389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
368 raw_spin_unlock_irqrestore(&rnp->lock, flags); 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
369 else 391 trace_rcu_quiescent_state_report("preempt_rcu",
392 rnp->gpnum,
393 0, rnp->qsmask,
394 rnp->level,
395 rnp->grplo,
396 rnp->grphi,
397 !!rnp->gp_tasks);
370 rcu_report_unblock_qs_rnp(rnp, flags); 398 rcu_report_unblock_qs_rnp(rnp, flags);
399 } else
400 raw_spin_unlock_irqrestore(&rnp->lock, flags);
371 401
372#ifdef CONFIG_RCU_BOOST 402#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */ 403 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) { 404 if (rbmp)
375 rt_mutex_unlock(t->rcu_boost_mutex); 405 rt_mutex_unlock(rbmp);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */ 406#endif /* #ifdef CONFIG_RCU_BOOST */
379 407
380 /* 408 /*
381 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
382 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
383 */ 411 */
384 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
385 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
386 } else { 414 } else {
387 local_irq_restore(flags); 415 local_irq_restore(flags);
388 } 416 }
@@ -399,10 +427,10 @@ void __rcu_read_unlock(void)
399{ 427{
400 struct task_struct *t = current; 428 struct task_struct *t = current;
401 429
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1) 430 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting; 431 --t->rcu_read_lock_nesting;
405 else { 432 else {
433 barrier(); /* critical section before exit code. */
406 t->rcu_read_lock_nesting = INT_MIN; 434 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */ 435 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 436 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
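
Editorial note: the hunk above only moves a barrier() into the outermost-unlock path, but the surrounding nesting protocol is easy to lose in the diff. Stripped of the barriers and per-task storage, it amounts to the following sketch (plain globals stand in for task_struct fields):

	#include <limits.h>
	#include <stdio.h>

	static int nesting;          /* models t->rcu_read_lock_nesting for one task */
	static int unlock_special;   /* models t->rcu_read_unlock_special */

	static void read_lock(void)
	{
		nesting++;
	}

	static void read_unlock(void)
	{
		if (nesting != 1) {
			--nesting;           /* inner critical section: just count down */
		} else {
			nesting = INT_MIN;   /* flag "outermost unlock in progress" */
			if (unlock_special)
				printf("running special cleanup\n");
			nesting = 0;         /* now fully outside the critical section */
		}
	}

	int main(void)
	{
		read_lock();
		read_lock();
		unlock_special = 1;
		read_unlock();   /* inner unlock: no cleanup */
		read_unlock();   /* outermost unlock: cleanup runs once */
		return 0;
	}
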
@@ -466,16 +494,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
466 * Scan the current list of tasks blocked within RCU read-side critical 494 * Scan the current list of tasks blocked within RCU read-side critical
467 * sections, printing out the tid of each. 495 * sections, printing out the tid of each.
468 */ 496 */
469static void rcu_print_task_stall(struct rcu_node *rnp) 497static int rcu_print_task_stall(struct rcu_node *rnp)
470{ 498{
471 struct task_struct *t; 499 struct task_struct *t;
500 int ndetected = 0;
472 501
473 if (!rcu_preempt_blocked_readers_cgp(rnp)) 502 if (!rcu_preempt_blocked_readers_cgp(rnp))
474 return; 503 return 0;
475 t = list_entry(rnp->gp_tasks, 504 t = list_entry(rnp->gp_tasks,
476 struct task_struct, rcu_node_entry); 505 struct task_struct, rcu_node_entry);
477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
478 printk(" P%d", t->pid); 507 printk(" P%d", t->pid);
508 ndetected++;
509 }
510 return ndetected;
479} 511}
480 512
481/* 513/*
@@ -656,18 +688,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
656 */ 688 */
657void synchronize_rcu(void) 689void synchronize_rcu(void)
658{ 690{
659 struct rcu_synchronize rcu;
660
661 if (!rcu_scheduler_active) 691 if (!rcu_scheduler_active)
662 return; 692 return;
663 693 wait_rcu_gp(call_rcu);
664 init_rcu_head_on_stack(&rcu.head);
665 init_completion(&rcu.completion);
666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
671} 694}
672EXPORT_SYMBOL_GPL(synchronize_rcu); 695EXPORT_SYMBOL_GPL(synchronize_rcu);
673 696
@@ -709,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
709 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
710 * iteratively!) 733 * iteratively!)
711 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
712 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
713 */ 739 */
714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
715{ 742{
716 unsigned long flags; 743 unsigned long flags;
717 unsigned long mask; 744 unsigned long mask;
@@ -724,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
724 } 751 }
725 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
727 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
728 break; 756 break;
729 } 757 }
730 mask = rnp->grpmask; 758 mask = rnp->grpmask;
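
Editorial note: the new "wake" argument lets the task that initiated the expedited grace period report completion without waking itself. The report itself walks up the rcu_node tree, clearing this node's bit at each level, and only the root decides whether to wake the waiter. A miniature model of that upward propagation (the node layout and masks are invented, and all locking is omitted):

	#include <stdio.h>
	#include <stdbool.h>

	struct node {                      /* invented miniature of rcu_node */
		struct node *parent;
		unsigned int expmask;      /* groups still blocking the expedited GP */
		unsigned int grpmask;      /* this node's bit in its parent's expmask */
	};

	static void wake_waiter(void)
	{
		printf("waking expedited waiter\n");
	}

	/* Clear our bit at each level; only the root decides whether to wake. */
	static void report_exp(struct node *np, bool wake)
	{
		for (;;) {
			if (np->expmask != 0)          /* still waiting on siblings */
				return;
			if (np->parent == NULL) {
				if (wake)              /* initiating task need not wake itself */
					wake_waiter();
				return;
			}
			np->parent->expmask &= ~np->grpmask;
			np = np->parent;
		}
	}

	int main(void)
	{
		struct node root = { NULL, 0x1, 0 };
		struct node leaf = { &root, 0x0, 0x1 };

		report_exp(&leaf, true);       /* last blocker gone: wakes at the root */
		return 0;
	}
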
@@ -757,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
757 must_wait = 1; 785 must_wait = 1;
758 } 786 }
759 if (!must_wait) 787 if (!must_wait)
760 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
761} 789}
762 790
763/* 791/*
@@ -968,8 +996,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
968 * Because preemptible RCU does not exist, we never have to check for 996 * Because preemptible RCU does not exist, we never have to check for
969 * tasks blocked within RCU read-side critical sections. 997 * tasks blocked within RCU read-side critical sections.
970 */ 998 */
971static void rcu_print_task_stall(struct rcu_node *rnp) 999static int rcu_print_task_stall(struct rcu_node *rnp)
972{ 1000{
1001 return 0;
973} 1002}
974 1003
975/* 1004/*
@@ -1048,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1048 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1049 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1050 */ 1079 */
1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1052{ 1082{
1053 return;
1054} 1083}
1055 1084
1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1199,12 +1228,12 @@ static int rcu_boost(struct rcu_node *rnp)
1199 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206 1234
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1208} 1237}
1209 1238
1210/* 1239/*
@@ -1228,9 +1257,12 @@ static int rcu_boost_kthread(void *arg)
1228 int spincnt = 0; 1257 int spincnt = 0;
1229 int more2boost; 1258 int more2boost;
1230 1259
1260 trace_rcu_utilization("Start boost kthread@init");
1231 for (;;) { 1261 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1263 trace_rcu_utilization("End boost kthread@rcu_wait");
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1264 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1265 trace_rcu_utilization("Start boost kthread@rcu_wait");
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1266 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp); 1267 more2boost = rcu_boost(rnp);
1236 if (more2boost) 1268 if (more2boost)
@@ -1238,11 +1270,14 @@ static int rcu_boost_kthread(void *arg)
1238 else 1270 else
1239 spincnt = 0; 1271 spincnt = 0;
1240 if (spincnt > 10) { 1272 if (spincnt > 10) {
1273 trace_rcu_utilization("End boost kthread@rcu_yield");
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1274 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1275 trace_rcu_utilization("Start boost kthread@rcu_yield");
1242 spincnt = 0; 1276 spincnt = 0;
1243 } 1277 }
1244 } 1278 }
1245 /* NOTREACHED */ 1279 /* NOTREACHED */
1280 trace_rcu_utilization("End boost kthread@notreached");
1246 return 0; 1281 return 0;
1247} 1282}
1248 1283
@@ -1291,15 +1326,22 @@ static void invoke_rcu_callbacks_kthread(void)
1291 1326
1292 local_irq_save(flags); 1327 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1); 1328 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 1329 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1295 local_irq_restore(flags); 1330 current != __this_cpu_read(rcu_cpu_kthread_task))
1296 return; 1331 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags); 1332 local_irq_restore(flags);
1300} 1333}
1301 1334
1302/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1305 * kthread. 1347 * kthread.
@@ -1343,13 +1385,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1343 if (rnp->boost_kthread_task != NULL) 1385 if (rnp->boost_kthread_task != NULL)
1344 return 0; 1386 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1387 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index); 1388 "rcub/%d", rnp_index);
1347 if (IS_ERR(t)) 1389 if (IS_ERR(t))
1348 return PTR_ERR(t); 1390 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags); 1391 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t; 1392 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1393 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO; 1394 sp.sched_priority = RCU_BOOST_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1395 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1396 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0; 1397 return 0;
@@ -1444,6 +1486,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{ 1486{
1445 struct sched_param sp; 1487 struct sched_param sp;
1446 struct timer_list yield_timer; 1488 struct timer_list yield_timer;
1489 int prio = current->rt_priority;
1447 1490
1448 setup_timer_on_stack(&yield_timer, f, arg); 1491 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2); 1492 mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1494,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1494 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19); 1495 set_user_nice(current, 19);
1453 schedule(); 1496 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO; 1497 set_user_nice(current, 0);
1498 sp.sched_priority = prio;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1499 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer); 1500 del_timer(&yield_timer);
1457} 1501}
@@ -1489,7 +1533,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1489 1533
1490/* 1534/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1535 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq. 1536 * RCU softirq used in flavors and configurations of RCU that do not
1537 * support RCU priority boosting.
1493 */ 1538 */
1494static int rcu_cpu_kthread(void *arg) 1539static int rcu_cpu_kthread(void *arg)
1495{ 1540{
@@ -1500,9 +1545,12 @@ static int rcu_cpu_kthread(void *arg)
1500 char work; 1545 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1546 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502 1547
1548 trace_rcu_utilization("Start CPU kthread@init");
1503 for (;;) { 1549 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING; 1550 *statusp = RCU_KTHREAD_WAITING;
1551 trace_rcu_utilization("End CPU kthread@rcu_wait");
1505 rcu_wait(*workp != 0 || kthread_should_stop()); 1552 rcu_wait(*workp != 0 || kthread_should_stop());
1553 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1506 local_bh_disable(); 1554 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) { 1555 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable(); 1556 local_bh_enable();
@@ -1523,11 +1571,14 @@ static int rcu_cpu_kthread(void *arg)
1523 spincnt = 0; 1571 spincnt = 0;
1524 if (spincnt > 10) { 1572 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING; 1573 *statusp = RCU_KTHREAD_YIELDING;
1574 trace_rcu_utilization("End CPU kthread@rcu_yield");
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1575 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1576 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1527 spincnt = 0; 1577 spincnt = 0;
1528 } 1578 }
1529 } 1579 }
1530 *statusp = RCU_KTHREAD_STOPPED; 1580 *statusp = RCU_KTHREAD_STOPPED;
1581 trace_rcu_utilization("End CPU kthread@term");
1531 return 0; 1582 return 0;
1532} 1583}
1533 1584
@@ -1560,7 +1611,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1560 if (!rcu_scheduler_fully_active || 1611 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1612 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0; 1613 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1614 t = kthread_create_on_node(rcu_cpu_kthread,
1615 (void *)(long)cpu,
1616 cpu_to_node(cpu),
1617 "rcuc/%d", cpu);
1564 if (IS_ERR(t)) 1618 if (IS_ERR(t))
1565 return PTR_ERR(t); 1619 return PTR_ERR(t);
1566 if (cpu_online(cpu)) 1620 if (cpu_online(cpu))
@@ -1669,7 +1723,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1669 return 0; 1723 return 0;
1670 if (rnp->node_kthread_task == NULL) { 1724 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp, 1725 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index); 1726 "rcun/%d", rnp_index);
1673 if (IS_ERR(t)) 1727 if (IS_ERR(t))
1674 return PTR_ERR(t); 1728 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags); 1729 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1731,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1731 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1732} 1786}
1733 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{ 1794{
1736} 1795}
@@ -1866,7 +1925,7 @@ void synchronize_sched_expedited(void)
1866 * grace period works for us. 1925 * grace period works for us.
1867 */ 1926 */
1868 get_online_cpus(); 1927 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 } 1930 }
1872 1931
@@ -1898,113 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1898 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1899 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1900 * 1959 *
1901 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1902 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1904 */ 1962 */
1905int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1906{ 1964{
1907 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1908} 1973}
1909 1974
1910/* 1975/*
1911 * Check to see if we need to continue a callback-flush operations to 1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle 1977 * after it.
1913 * entry is not configured, so we never do need to.
1914 */ 1978 */
1915static void rcu_needs_cpu_flush(void) 1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1916{ 1988{
1917} 1989}
1918 1990
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920 1992
1921#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1922static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1924 2031
1925/* 2032/*
1926 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1927 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1928 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1929 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
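
Editorial note: rcu_prepare_for_idle_init() above arms nothing itself; it only prepares a one-shot hrtimer, sized in jiffies and converted to nanoseconds, that later nudges a CPU that went dyntick-idle with callbacks still queued. A loose userspace analog of that pattern using a POSIX one-shot timer (tick length and delay are invented, and timer_create() here is the POSIX API, not the kernel's hrtimer interface):

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define GP_DELAY_TICKS 6              /* "roughly one grace period" */
	#define TICK_USEC 10000               /* invented 100HZ tick length */

	static void timer_fired(union sigval sv)
	{
		(void)sv;
		printf("idle GP timer fired: push remaining callbacks\n");
	}

	int main(void)
	{
		struct sigevent sev;
		struct itimerspec its;
		timer_t tid;

		memset(&sev, 0, sizeof(sev));
		memset(&its, 0, sizeof(its));
		sev.sigev_notify = SIGEV_THREAD;
		sev.sigev_notify_function = timer_fired;
		timer_create(CLOCK_MONOTONIC, &sev, &tid);

		/* Convert the tick count to nanoseconds, as the init code above does. */
		its.it_value.tv_nsec = (long)GP_DELAY_TICKS * TICK_USEC * 1000;
		timer_settime(tid, 0, &its, NULL);

		sleep(1);                     /* give the one-shot timer time to expire */
		timer_delete(tid);
		return 0;
	}
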
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1930 * 2095 *
1931 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1932 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1933 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1934 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1935 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1936 * 2102 *
1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1938 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1941 */ 2109 */
1942int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1943{ 2111{
1944 int c = 0; 2112 unsigned long flags;
1945 int snap; 2113
1946 int thatcpu; 2114 local_irq_save(flags);
1947 2115
1948 /* Check for being in the holdoff period. */ 2116 /*
1949 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1950 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1951 2119 */
1952 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1953 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1954 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1955 continue; 2123 local_irq_restore(flags);
1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1957 thatcpu).dynticks); 2125 return;
1958 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1959 if ((snap & 0x1) != 0) { 2127
1960 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1962 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1963 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1964 } 2136 }
1965 2137
1966 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1967 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1968 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
1969 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
1970 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1971 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
1972 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1973 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
1974 } 2158 }
1975 2159
1976 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1977 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
1978 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
1979 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
1980 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
1981 } 2177 }
1982 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
1983 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
1984 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
1985 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
1986 } 2183 }
1987 2184
1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
1989 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
1990 invoke_rcu_core(); 2192 invoke_rcu_core();
1991 return c; 2193 } else {
1992} 2194 local_irq_restore(flags);
1993 2195 trace_rcu_prep_idle("Callbacks drained");
1994/* 2196 }
1995 * Check to see if we need to continue a callback-flush operations to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2002
2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004 return;
2005 local_irq_save(flags);
2006 (void)rcu_needs_cpu(cpu);
2007 local_irq_restore(flags);
2008} 2197}
2009 2198
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
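
Note on the rcutree_plugin.h hunk above: it replaces the old rcu_needs_cpu()/rcu_needs_cpu_flush() pair with rcu_prepare_for_idle(), which runs with interrupts disabled, emits a trace_rcu_prep_idle() event for every decision, and can now let a CPU stop its tick even with callbacks still queued by arming rcu_idle_gp_timer. The sequencing still rests on two per-CPU variables: a drain counter bounding how many flush attempts are made, and a jiffies-stamped holdoff that suppresses further attempts for the rest of the tick. The following self-contained userspace sketch shows only that counter/holdoff logic; the names, the flush limit and the simulated jiffies counter are illustrative stand-ins, and the "dyntick with callbacks" hrtimer branch is omitted.

/*
 * Illustrative userspace sketch of the drain/holdoff sequencing used by
 * rcu_prepare_for_idle() above.  Not kernel code: "jiffies" is simulated
 * and RCU_IDLE_FLUSHES is an assumed value.
 */
#include <stdbool.h>
#include <stdio.h>

#define RCU_IDLE_FLUSHES        5       /* assumed limit, for illustration */

struct cpu_state {
        int drain;              /* plays the role of rcu_dyntick_drain   */
        unsigned long holdoff;  /* plays the role of rcu_dyntick_holdoff */
};

static unsigned long jiffies;   /* simulated tick counter */

/* Decide what to do on idle entry; true means the CPU may stop its tick. */
static bool prepare_for_idle(struct cpu_state *cs, bool have_callbacks)
{
        if (!have_callbacks) {
                cs->holdoff = jiffies - 1;      /* reset state ("No callbacks") */
                cs->drain = 0;
                return true;
        }
        if (cs->holdoff == jiffies)
                return false;                   /* "In holdoff" */
        if (cs->drain <= 0) {
                cs->drain = RCU_IDLE_FLUSHES;   /* first pass: arm the counter */
        } else if (--cs->drain <= 0) {
                cs->holdoff = jiffies;          /* "Begin holdoff" until next jiffy */
                return false;
        }
        /* The kernel now pushes callbacks through and re-checks ("More callbacks"). */
        return false;
}

int main(void)
{
        struct cpu_state cs = { 0, 0 };

        for (jiffies = 1; jiffies <= 2; jiffies++)
                for (int pass = 0; pass < 8; pass++)
                        printf("j=%lu pass=%d -> %s\n", jiffies, pass,
                               prepare_for_idle(&cs, pass < 6) ? "stop tick" : "keep tick");
        return 0;
}
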
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc0..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
48 48
49#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
50 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status) 51static char convert_kthread_status(unsigned int kthread_status)
57{ 52{
58 if (kthread_status > RCU_KTHREAD_MAX) 53 if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,19 +61,17 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
66{ 61{
67 if (!rdp->beenonline) 62 if (!rdp->beenonline)
68 return; 63 return;
69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", 64 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
70 rdp->cpu, 65 rdp->cpu,
71 cpu_is_offline(rdp->cpu) ? '!' : ' ', 66 cpu_is_offline(rdp->cpu) ? '!' : ' ',
72 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
73 rdp->passed_quiesc, rdp->passed_quiesc_completed, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
74 rdp->qs_pending); 69 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
76 seq_printf(m, " dt=%d/%d/%d df=%lu",
77 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
78 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
79 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
80 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
81#endif /* #ifdef CONFIG_NO_HZ */
82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
83 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen, 77 rdp->qlen,
@@ -144,15 +137,13 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->cpu, 137 rdp->cpu,
145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 138 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
146 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
147 rdp->passed_quiesc, rdp->passed_quiesc_completed, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
148 rdp->qs_pending); 141 rdp->qs_pending);
149#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
150 seq_printf(m, ",%d,%d,%d,%lu",
151 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
152 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
153 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
154 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
155#endif /* #ifdef CONFIG_NO_HZ */
156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -175,10 +166,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
175 166
176static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
177{ 168{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
179#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */
182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -283,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
286 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
287 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
288 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
289 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/relay.c b/kernel/relay.c
index 859ea5a9605f..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 302 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 304 struct dentry *parent,
305 int mode, 305 umode_t mode,
306 struct rchan_buf *buf, 306 struct rchan_buf *buf,
307 int *is_global) 307 int *is_global)
308{ 308{
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..6d269cce7aa1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 159 return 0;
160 } 160 }
161 161
162 /* FIXME - make memparse() take const char* args */ 162 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 163 if (*end != '\0')
165 return -EINVAL; 164 return -EINVAL;
166 165
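
The res_counter.c hunk above is possible because memparse() now takes a const char * argument, so both the FIXME and the cast can go. For reference, memparse() (lib/cmdline.c) reads a number with an optional size suffix and reports where parsing stopped; below is a simplified userspace analogue, without the kernel's larger suffix set.

/*
 * Simplified userspace analogue of the kernel's memparse() helper used in
 * the hunk above.  The real version also handles T/P/E suffixes.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_demo(const char *ptr, char **retptr)
{
        char *end;
        unsigned long long ret = strtoull(ptr, &end, 0);

        switch (*end) {
        case 'G': case 'g':
                ret <<= 10;     /* fall through */
        case 'M': case 'm':
                ret <<= 10;     /* fall through */
        case 'K': case 'k':
                ret <<= 10;
                end++;
        }
        if (retptr)
                *retptr = end;
        return ret;
}

int main(void)
{
        char *end;
        unsigned long long val = memparse_demo("512K", &end);

        printf("%llu, rest='%s'\n", val, end);  /* 524288, rest='' */
        return 0;
}
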
diff --git a/kernel/resource.c b/kernel/resource.c
index c8dc249da5ce..7640b3a947d0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/errno.h> 11#include <linux/errno.h>
12#include <linux/ioport.h> 12#include <linux/ioport.h>
13#include <linux/init.h> 13#include <linux/init.h>
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33be..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,61 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
87static void printk_task(struct task_struct *p) 32static void printk_task(struct task_struct *p)
88{ 33{
89 if (p) 34 if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
111 56
112void rt_mutex_debug_task_free(struct task_struct *task) 57void rt_mutex_debug_task_free(struct task_struct *task)
113{ 58{
114 WARN_ON(!plist_head_empty(&task->pi_waiters)); 59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
115 WARN_ON(task->pi_blocked_on); 60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
116} 61}
117 62
118/* 63/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
125{ 70{
126 struct task_struct *task; 71 struct task_struct *task;
127 72
128 if (!rt_trace_on || detect || !act_waiter) 73 if (!debug_locks || detect || !act_waiter)
129 return; 74 return;
130 75
131 task = rt_mutex_owner(act_waiter->lock); 76 task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
139{ 84{
140 struct task_struct *task; 85 struct task_struct *task;
141 86
142 if (!waiter->deadlock_lock || !rt_trace_on) 87 if (!waiter->deadlock_lock || !debug_locks)
143 return; 88 return;
144 89
145 rcu_read_lock(); 90 rcu_read_lock();
@@ -149,10 +94,14 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
149 return; 94 return;
150 } 95 }
151 96
152 TRACE_OFF_NOLOCK(); 97 if (!debug_locks_off()) {
98 rcu_read_unlock();
99 return;
100 }
153 101
154 printk("\n============================================\n"); 102 printk("\n============================================\n");
155 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
156 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
157 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
158 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
@@ -180,7 +129,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
180 129
181 printk("[ turning off deadlock detection." 130 printk("[ turning off deadlock detection."
182 "Please report this trace. ]\n\n"); 131 "Please report this trace. ]\n\n");
183 local_irq_disable();
184} 132}
185 133
186void debug_rt_mutex_lock(struct rt_mutex *lock) 134void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +137,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
189 137
190void debug_rt_mutex_unlock(struct rt_mutex *lock) 138void debug_rt_mutex_unlock(struct rt_mutex *lock)
191{ 139{
192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); 140 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
193} 141}
194 142
195void 143void
@@ -199,7 +147,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
199 147
200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 148void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
201{ 149{
202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); 150 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
203} 151}
204 152
205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 153void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +161,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 161void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
214{ 162{
215 put_pid(waiter->deadlock_task_pid); 163 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 memset(waiter, 0x22, sizeof(*waiter)); 166 memset(waiter, 0x22, sizeof(*waiter));
219} 167}
220 168
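
The rtmutex-debug.c diff above drops the file-local rt_trace_on flag and the TRACE_* macro family in favour of the kernel's shared lock-debugging infrastructure (debug_locks, debug_locks_off(), DEBUG_LOCKS_WARN_ON()), and adds print_tainted() to the report header. The practical effect is that the first deadlock report atomically disables all further lock debugging, the same one-shot behaviour lockdep uses, instead of maintaining a private flag. A minimal userspace analogue of that one-shot pattern, with simplified stand-ins for the kernel helpers:

/*
 * Userspace sketch of the one-shot reporting pattern the hunk above
 * converts to.  debug_locks and debug_locks_off() are simplified stand-ins
 * for the kernel versions in lib/debug_locks.c.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_locks = 1;

/* Returns nonzero only for the caller that actually turned debugging off. */
static int debug_locks_off(void)
{
        return atomic_exchange(&debug_locks, 0);
}

/* Stand-in for DEBUG_LOCKS_WARN_ON(): warn once, then stay silent. */
static int debug_locks_warn_on(int cond, const char *what)
{
        if (cond && debug_locks_off())
                fprintf(stderr, "WARN: %s\n", what);
        return cond;
}

static void report_deadlock(void)
{
        if (!debug_locks_off())
                return;         /* some report already ran; stay quiet */
        fprintf(stderr, "[ BUG: circular locking deadlock detected! ]\n");
}

int main(void)
{
        report_deadlock();                              /* prints the banner */
        report_deadlock();                              /* silent: one-shot  */
        debug_locks_warn_on(1, "pi_blocked_on set");    /* silent as well    */
        return 0;
}
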
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 5c9ccd380966..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/module.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
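
The rtmutex-tester.c conversion above replaces the removed sysdev interface (struct sys_device, sysdev_class, SYSDEV_ATTR) with regular driver-core objects: a struct bus_type registered through subsys_system_register(), DEVICE_ATTR() attributes, and device_register()/device_create_file() for each test thread. A condensed sketch of that registration pattern follows; it mirrors the 3.x driver-core calls used in the hunk, with error unwinding and module teardown elided, and it only builds inside a kernel tree.

/*
 * Minimal sketch of the driver-core pattern the hunk above converts to:
 * one device on a "system" subsystem with a sysfs attribute.  Names are
 * illustrative; cleanup and most error handling are elided.
 */
#include <linux/device.h>
#include <linux/module.h>

static struct bus_type demo_subsys = {
        .name           = "rttest_demo",
        .dev_name       = "rttest_demo",
};

static struct device demo_dev;

static ssize_t status_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        return sprintf(buf, "id=%d\n", dev->id);
}
static DEVICE_ATTR(status, 0400, status_show, NULL);

static int __init demo_init(void)
{
        int ret;

        ret = subsys_system_register(&demo_subsys, NULL);
        if (ret)
                return ret;

        demo_dev.bus = &demo_subsys;
        demo_dev.id = 0;
        ret = device_register(&demo_dev);       /* appears as "rttest_demo0" */
        if (ret)
                return ret;

        return device_create_file(&demo_dev, &dev_attr_status);
}
module_init(demo_init);
MODULE_LICENSE("GPL");
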
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acdb..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9f48f3d82e9b..b152f74f02de 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,7 +7,7 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index 9d8af0b3fb64..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
@@ -62,7 +62,7 @@
62 */ 62 */
63#include <linux/spinlock.h> 63#include <linux/spinlock.h>
64#include <linux/hardirq.h> 64#include <linux/hardirq.h>
65#include <linux/module.h> 65#include <linux/export.h>
66#include <linux/percpu.h> 66#include <linux/percpu.h>
67#include <linux/ktime.h> 67#include <linux/ktime.h>
68#include <linux/sched.h> 68#include <linux/sched.h>
diff --git a/kernel/sched.c b/kernel/sched/core.c
index 5670028a9c16..df00cb09263e 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -71,593 +70,46 @@
71#include <linux/ctype.h> 70#include <linux/ctype.h>
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h>
74 74
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h> 78#include <asm/paravirt.h>
80#endif 79#endif
81 80
82#include "sched_cpupri.h" 81#include "sched.h"
83#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
84#include "sched_autogroup.h"
85 83
86#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
87#include <trace/events/sched.h> 85#include <trace/events/sched.h>
88 86
89/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
90 * Convert user-nice values [ -20 ... 0 ... 19 ]
91 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
92 * and back.
93 */
94#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
95#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
96#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
97
98/*
99 * 'User priority' is the nice value converted to something we
100 * can work with better when scaling various scheduler parameters,
101 * it's a [ 0 ... 39 ] range.
102 */
103#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
104#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
105#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
106
107/*
108 * Helpers for converting nanosecond timing to jiffy resolution
109 */
110#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
111
112#define NICE_0_LOAD SCHED_LOAD_SCALE
113#define NICE_0_SHIFT SCHED_LOAD_SHIFT
114
115/*
116 * These are the 'tuning knobs' of the scheduler:
117 *
118 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
119 * Timeslices get refilled after they expire.
120 */
121#define DEF_TIMESLICE (100 * HZ / 1000)
122
123/*
124 * single value that denotes runtime == period, ie unlimited time.
125 */
126#define RUNTIME_INF ((u64)~0ULL)
127
128static inline int rt_policy(int policy)
129{
130 if (policy == SCHED_FIFO || policy == SCHED_RR)
131 return 1;
132 return 0;
133}
134
135static inline int task_has_rt_policy(struct task_struct *p)
136{
137 return rt_policy(p->policy);
138}
139
140/*
141 * This is the priority-queue data structure of the RT scheduling class:
142 */
143struct rt_prio_array {
144 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
145 struct list_head queue[MAX_RT_PRIO];
146};
147
148struct rt_bandwidth {
149 /* nests inside the rq lock: */
150 raw_spinlock_t rt_runtime_lock;
151 ktime_t rt_period;
152 u64 rt_runtime;
153 struct hrtimer rt_period_timer;
154};
155
156static struct rt_bandwidth def_rt_bandwidth;
157
158static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
159
160static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
161{
162 struct rt_bandwidth *rt_b =
163 container_of(timer, struct rt_bandwidth, rt_period_timer);
164 ktime_t now;
165 int overrun;
166 int idle = 0;
167
168 for (;;) {
169 now = hrtimer_cb_get_time(timer);
170 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
171
172 if (!overrun)
173 break;
174
175 idle = do_sched_rt_period_timer(rt_b, overrun);
176 }
177
178 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
179}
180
181static
182void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
183{
184 rt_b->rt_period = ns_to_ktime(period);
185 rt_b->rt_runtime = runtime;
186
187 raw_spin_lock_init(&rt_b->rt_runtime_lock);
188
189 hrtimer_init(&rt_b->rt_period_timer,
190 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
191 rt_b->rt_period_timer.function = sched_rt_period_timer;
192}
193
194static inline int rt_bandwidth_enabled(void)
195{ 88{
196 return sysctl_sched_rt_runtime >= 0; 89 unsigned long delta;
197} 90 ktime_t soft, hard, now;
198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200{
201 ktime_t now;
202
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return;
205 91
206 if (hrtimer_active(&rt_b->rt_period_timer))
207 return;
208
209 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 92 for (;;) {
211 unsigned long delta; 93 if (hrtimer_active(period_timer))
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break; 94 break;
216 95
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 96 now = hrtimer_cb_get_time(period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 97 hrtimer_forward(period_timer, now, period);
219 98
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 99 soft = hrtimer_get_softexpires(period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 100 hard = hrtimer_get_expires(period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft)); 101 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 102 __hrtimer_start_range_ns(period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0); 103 HRTIMER_MODE_ABS_PINNED, 0);
225 } 104 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 105}
228 106
229#ifdef CONFIG_RT_GROUP_SCHED 107DEFINE_MUTEX(sched_domains_mutex);
230static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
231{
232 hrtimer_cancel(&rt_b->rt_period_timer);
233}
234#endif
235
236/*
237 * sched_domains_mutex serializes calls to init_sched_domains,
238 * detach_destroy_domains and partition_sched_domains.
239 */
240static DEFINE_MUTEX(sched_domains_mutex);
241
242#ifdef CONFIG_CGROUP_SCHED
243
244#include <linux/cgroup.h>
245
246struct cfs_rq;
247
248static LIST_HEAD(task_groups);
249
250/* task group related information */
251struct task_group {
252 struct cgroup_subsys_state css;
253
254#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */
256 struct sched_entity **se;
257 /* runqueue "owned" by this group on each cpu */
258 struct cfs_rq **cfs_rq;
259 unsigned long shares;
260
261 atomic_t load_weight;
262#endif
263
264#ifdef CONFIG_RT_GROUP_SCHED
265 struct sched_rt_entity **rt_se;
266 struct rt_rq **rt_rq;
267
268 struct rt_bandwidth rt_bandwidth;
269#endif
270
271 struct rcu_head rcu;
272 struct list_head list;
273
274 struct task_group *parent;
275 struct list_head siblings;
276 struct list_head children;
277
278#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup;
280#endif
281};
282
283/* task_group_lock serializes the addition/removal of task groups */
284static DEFINE_SPINLOCK(task_group_lock);
285
286#ifdef CONFIG_FAIR_GROUP_SCHED
287
288# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
289
290/*
291 * A weight of 0 or 1 can cause arithmetics problems.
292 * A weight of a cfs_rq is the sum of weights of which entities
293 * are queued on this cfs_rq, so a weight of a entity should not be
294 * too large, so as the shares value of a task group.
295 * (The default weight is 1024 - so there's no practical
296 * limitation from this.)
297 */
298#define MIN_SHARES (1UL << 1)
299#define MAX_SHARES (1UL << 18)
300
301static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
302#endif
303
304/* Default task group.
305 * Every task in system belong to this group at bootup.
306 */
307struct task_group root_task_group;
308
309#endif /* CONFIG_CGROUP_SCHED */
310
311/* CFS-related fields in a runqueue */
312struct cfs_rq {
313 struct load_weight load;
314 unsigned long nr_running;
315
316 u64 exec_clock;
317 u64 min_vruntime;
318#ifndef CONFIG_64BIT
319 u64 min_vruntime_copy;
320#endif
321
322 struct rb_root tasks_timeline;
323 struct rb_node *rb_leftmost;
324
325 struct list_head tasks;
326 struct list_head *balance_iterator;
327
328 /*
329 * 'curr' points to currently running entity on this cfs_rq.
330 * It is set to NULL otherwise (i.e when none are currently running).
331 */
332 struct sched_entity *curr, *next, *last, *skip;
333
334#ifdef CONFIG_SCHED_DEBUG
335 unsigned int nr_spread_over;
336#endif
337
338#ifdef CONFIG_FAIR_GROUP_SCHED
339 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
340
341 /*
342 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
343 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
344 * (like users, containers etc.)
345 *
346 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
347 * list is used during load balance.
348 */
349 int on_list;
350 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */
352
353#ifdef CONFIG_SMP
354 /*
355 * the part of load.weight contributed by tasks
356 */
357 unsigned long task_weight;
358
359 /*
360 * h_load = weight * f(tg)
361 *
362 * Where f(tg) is the recursive weight fraction assigned to
363 * this group.
364 */
365 unsigned long h_load;
366
367 /*
368 * Maintaining per-cpu shares distribution for group scheduling
369 *
370 * load_stamp is the last time we updated the load average
371 * load_last is the last time we updated the load average and saw load
372 * load_unacc_exec_time is currently unaccounted execution time
373 */
374 u64 load_avg;
375 u64 load_period;
376 u64 load_stamp, load_last, load_unacc_exec_time;
377
378 unsigned long load_contribution;
379#endif
380#endif
381};
382
383/* Real-Time classes' related field in a runqueue: */
384struct rt_rq {
385 struct rt_prio_array active;
386 unsigned long rt_nr_running;
387#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
388 struct {
389 int curr; /* highest queued rt task prio */
390#ifdef CONFIG_SMP
391 int next; /* next highest */
392#endif
393 } highest_prio;
394#endif
395#ifdef CONFIG_SMP
396 unsigned long rt_nr_migratory;
397 unsigned long rt_nr_total;
398 int overloaded;
399 struct plist_head pushable_tasks;
400#endif
401 int rt_throttled;
402 u64 rt_time;
403 u64 rt_runtime;
404 /* Nests inside the rq lock: */
405 raw_spinlock_t rt_runtime_lock;
406
407#ifdef CONFIG_RT_GROUP_SCHED
408 unsigned long rt_nr_boosted;
409
410 struct rq *rq;
411 struct list_head leaf_rt_rq_list;
412 struct task_group *tg;
413#endif
414};
415
416#ifdef CONFIG_SMP
417
418/*
419 * We add the notion of a root-domain which will be used to define per-domain
420 * variables. Each exclusive cpuset essentially defines an island domain by
421 * fully partitioning the member cpus from any other cpuset. Whenever a new
422 * exclusive cpuset is created, we also create and attach a new root-domain
423 * object.
424 *
425 */
426struct root_domain {
427 atomic_t refcount;
428 atomic_t rto_count;
429 struct rcu_head rcu;
430 cpumask_var_t span;
431 cpumask_var_t online;
432
433 /*
434 * The "RT overload" flag: it gets set if a CPU has more than
435 * one runnable RT task.
436 */
437 cpumask_var_t rto_mask;
438 struct cpupri cpupri;
439};
440
441/*
442 * By default the system creates a single root-domain with all cpus as
443 * members (mimicking the global state we have today).
444 */
445static struct root_domain def_root_domain;
446
447#endif /* CONFIG_SMP */
448
449/*
450 * This is the main, per-CPU runqueue data structure.
451 *
452 * Locking rule: those places that want to lock multiple runqueues
453 * (such as the load balancing or the thread migration code), lock
454 * acquire operations must be ordered by ascending &runqueue.
455 */
456struct rq {
457 /* runqueue lock: */
458 raw_spinlock_t lock;
459
460 /*
461 * nr_running and cpu_load should be in the same cacheline because
462 * remote CPUs use both these fields when doing load calculation.
463 */
464 unsigned long nr_running;
465 #define CPU_LOAD_IDX_MAX 5
466 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
467 unsigned long last_load_update_tick;
468#ifdef CONFIG_NO_HZ
469 u64 nohz_stamp;
470 unsigned char nohz_balance_kick;
471#endif
472 int skip_clock_update;
473
474 /* capture load from *all* tasks on this cpu: */
475 struct load_weight load;
476 unsigned long nr_load_updates;
477 u64 nr_switches;
478
479 struct cfs_rq cfs;
480 struct rt_rq rt;
481
482#ifdef CONFIG_FAIR_GROUP_SCHED
483 /* list of leaf cfs_rq on this cpu: */
484 struct list_head leaf_cfs_rq_list;
485#endif
486#ifdef CONFIG_RT_GROUP_SCHED
487 struct list_head leaf_rt_rq_list;
488#endif
489
490 /*
491 * This is part of a global counter where only the total sum
492 * over all CPUs matters. A task can increase this counter on
493 * one CPU and if it got migrated afterwards it may decrease
494 * it on another CPU. Always updated under the runqueue lock:
495 */
496 unsigned long nr_uninterruptible;
497
498 struct task_struct *curr, *idle, *stop;
499 unsigned long next_balance;
500 struct mm_struct *prev_mm;
501
502 u64 clock;
503 u64 clock_task;
504
505 atomic_t nr_iowait;
506
507#ifdef CONFIG_SMP
508 struct root_domain *rd;
509 struct sched_domain *sd;
510
511 unsigned long cpu_power;
512
513 unsigned char idle_at_tick;
514 /* For active balancing */
515 int post_schedule;
516 int active_balance;
517 int push_cpu;
518 struct cpu_stop_work active_balance_work;
519 /* cpu of this runqueue: */
520 int cpu;
521 int online;
522
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg;
526 u64 age_stamp;
527 u64 idle_stamp;
528 u64 avg_idle;
529#endif
530
531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
532 u64 prev_irq_time;
533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
540
541 /* calc_load related fields */
542 unsigned long calc_load_update;
543 long calc_load_active;
544
545#ifdef CONFIG_SCHED_HRTICK
546#ifdef CONFIG_SMP
547 int hrtick_csd_pending;
548 struct call_single_data hrtick_csd;
549#endif
550 struct hrtimer hrtick_timer;
551#endif
552
553#ifdef CONFIG_SCHEDSTATS
554 /* latency stats */
555 struct sched_info rq_sched_info;
556 unsigned long long rq_cpu_time;
557 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
558
559 /* sys_sched_yield() stats */
560 unsigned int yld_count;
561
562 /* schedule() stats */
563 unsigned int sched_switch;
564 unsigned int sched_count;
565 unsigned int sched_goidle;
566
567 /* try_to_wake_up() stats */
568 unsigned int ttwu_count;
569 unsigned int ttwu_local;
570#endif
571
572#ifdef CONFIG_SMP
573 struct task_struct *wake_list;
574#endif
575};
576
577static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
578
579
580static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
581
582static inline int cpu_of(struct rq *rq)
583{
584#ifdef CONFIG_SMP
585 return rq->cpu;
586#else
587 return 0;
588#endif
589}
590
591#define rcu_dereference_check_sched_domain(p) \
592 rcu_dereference_check((p), \
593 lockdep_is_held(&sched_domains_mutex))
594
595/*
596 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
597 * See detach_destroy_domains: synchronize_sched for details.
598 *
599 * The domain tree of any CPU may only be accessed from within
600 * preempt-disabled sections.
601 */
602#define for_each_domain(cpu, __sd) \
603 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
604
605#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
606#define this_rq() (&__get_cpu_var(runqueues))
607#define task_rq(p) cpu_rq(task_cpu(p))
608#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
609#define raw_rq() (&__raw_get_cpu_var(runqueues))
610
611#ifdef CONFIG_CGROUP_SCHED
612
613/*
614 * Return the group to which this tasks belongs.
615 *
616 * We use task_subsys_state_check() and extend the RCU verification with
617 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
618 * task it moves into the cgroup. Therefore by holding either of those locks,
619 * we pin the task to the current cgroup.
620 */
621static inline struct task_group *task_group(struct task_struct *p)
622{
623 struct task_group *tg;
624 struct cgroup_subsys_state *css;
625
626 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
627 lockdep_is_held(&p->pi_lock) ||
628 lockdep_is_held(&task_rq(p)->lock));
629 tg = container_of(css, struct task_group, css);
630
631 return autogroup_task_group(p, tg);
632}
633
634/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
635static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
636{
637#ifdef CONFIG_FAIR_GROUP_SCHED
638 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
639 p->se.parent = task_group(p)->se[cpu];
640#endif
641
642#ifdef CONFIG_RT_GROUP_SCHED
643 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
644 p->rt.parent = task_group(p)->rt_se[cpu];
645#endif
646}
647
648#else /* CONFIG_CGROUP_SCHED */
649
650static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
651static inline struct task_group *task_group(struct task_struct *p)
652{
653 return NULL;
654}
655
656#endif /* CONFIG_CGROUP_SCHED */
657 109
658static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
659 111
660static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
661{ 113{
662 s64 delta; 114 s64 delta;
663 115
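
In this first kernel/sched.c to kernel/sched/core.c hunk, the large block of removed lines (nice/priority macros, rt_bandwidth, task_group, cfs_rq, rt_rq, struct rq and the context-switch helpers) is not deleted outright: with the split into kernel/sched/, those definitions are evidently moved into the shared "sched.h" header that core.c now includes, which is also why formerly static symbols such as update_rq_clock(), resched_task(), sched_domains_mutex and runqueues lose their static qualifier in this diff. The one piece of genuinely new code, start_bandwidth_timer(), generalises the old start_rt_bandwidth() loop: it repeatedly forwards the period timer past the current time and re-arms it over the soft/hard expiry range. A self-contained sketch of that forwarding arithmetic, using plain nanosecond counters instead of ktime_t and hrtimers:

/*
 * Userspace sketch of what the hrtimer_forward() step inside
 * start_bandwidth_timer() accomplishes: push a periodic timer's expiry
 * forward past "now" by a whole number of periods and report how many
 * periods were skipped.  Times are bare nanosecond counters here.
 */
#include <stdio.h>

static unsigned long timer_forward(unsigned long long *expires,
                                   unsigned long long now,
                                   unsigned long long period)
{
        unsigned long overrun;

        if (now < *expires)
                return 0;       /* expiry still in the future: nothing to do */
        overrun = (now - *expires) / period + 1;
        *expires += (unsigned long long)overrun * period;
        return overrun;
}

int main(void)
{
        unsigned long long expires = 1000, period = 250, now = 1620;

        printf("overrun=%lu, new expiry=%llu\n",
               timer_forward(&expires, now, period), expires);
        /* expect overrun=3 and expiry 1750, the first instant after now */
        return 0;
}
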
@@ -670,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
670} 122}
671 123
672/* 124/*
673 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
674 */
675#ifdef CONFIG_SCHED_DEBUG
676# define const_debug __read_mostly
677#else
678# define const_debug static const
679#endif
680
681/**
682 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
683 * @cpu: the processor in question.
684 *
685 * This interface allows printk to be called with the runqueue lock
686 * held and know whether or not it is OK to wake up the klogd.
687 */
688int runqueue_is_locked(int cpu)
689{
690 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
691}
692
693/*
694 * Debugging: various feature bits 125 * Debugging: various feature bits
695 */ 126 */
696 127
697#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
698 __SCHED_FEAT_##name ,
699
700enum {
701#include "sched_features.h"
702};
703
704#undef SCHED_FEAT
705
706#define SCHED_FEAT(name, enabled) \
707 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
708 130
709const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
710#include "sched_features.h" 132#include "features.h"
711 0; 133 0;
712 134
713#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -717,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
717 #name , 139 #name ,
718 140
719static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
720#include "sched_features.h" 142#include "features.h"
721 NULL 143 NULL
722}; 144};
723 145
@@ -727,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
727{ 149{
728 int i; 150 int i;
729 151
730 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
731 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
732 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
733 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -737,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
737 return 0; 159 return 0;
738} 160}
739 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
740static ssize_t 192static ssize_t
741sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
742 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -760,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
760 cmp += 3; 212 cmp += 3;
761 } 213 }
762 214
763 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
764 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
765 if (neg) 217 if (neg) {
766 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
767 else 219 sched_feat_disable(i);
220 } else {
768 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
769 break; 224 break;
770 } 225 }
771 } 226 }
772 227
773 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
774 return -EINVAL; 229 return -EINVAL;
775 230
776 *ppos += cnt; 231 *ppos += cnt;
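
sched_feat_write() above now iterates over the fixed __SCHED_FEAT_NR range and flips the corresponding jump-label key, rather than walking the NULL-terminated name array, but the user-visible syntax is unchanged: writing a feature name sets its bit and a "NO_" prefix clears it. A self-contained sketch of that parse-and-toggle step (the feature list and the default mask here are made up for illustration):

/*
 * Userspace sketch of the name matching and bit toggling performed by
 * sched_feat_write() in the hunk above.  Feature names and defaults are
 * illustrative only.
 */
#include <stdio.h>
#include <string.h>

static const char *feat_names[] = {
        "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "HRTICK", NULL
};
static unsigned long features = 0x3;    /* first two enabled by default */

static int feat_write(const char *cmp)
{
        int neg = 0;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
        for (int i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        if (neg)
                                features &= ~(1UL << i);
                        else
                                features |= 1UL << i;
                        return 0;
                }
        }
        return -1;      /* the kernel returns -EINVAL here */
}

int main(void)
{
        feat_write("NO_START_DEBIT");
        feat_write("HRTICK");
        printf("features=%#lx\n", features);    /* expect 0x5 */
        return 0;
}
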
@@ -799,10 +254,7 @@ static __init int sched_init_debug(void)
799 return 0; 254 return 0;
800} 255}
801late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
802 257#endif /* CONFIG_SCHED_DEBUG */
803#endif
804
805#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
806 258
807/* 259/*
808 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -824,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
824 */ 276 */
825unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
826 278
827static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
828 280
829/* 281/*
830 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -832,112 +284,7 @@ static __read_mostly int scheduler_running;
832 */ 284 */
833int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
834 286
835static inline u64 global_rt_period(void)
836{
837 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
838}
839
840static inline u64 global_rt_runtime(void)
841{
842 if (sysctl_sched_rt_runtime < 0)
843 return RUNTIME_INF;
844
845 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
846}
847
848#ifndef prepare_arch_switch
849# define prepare_arch_switch(next) do { } while (0)
850#endif
851#ifndef finish_arch_switch
852# define finish_arch_switch(prev) do { } while (0)
853#endif
854
855static inline int task_current(struct rq *rq, struct task_struct *p)
856{
857 return rq->curr == p;
858}
859
860static inline int task_running(struct rq *rq, struct task_struct *p)
861{
862#ifdef CONFIG_SMP
863 return p->on_cpu;
864#else
865 return task_current(rq, p);
866#endif
867}
868
869#ifndef __ARCH_WANT_UNLOCKED_CTXSW
870static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
871{
872#ifdef CONFIG_SMP
873 /*
874 * We can optimise this out completely for !SMP, because the
875 * SMP rebalancing from interrupt is the only thing that cares
876 * here.
877 */
878 next->on_cpu = 1;
879#endif
880}
881 287
882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
883{
884#ifdef CONFIG_SMP
885 /*
886 * After ->on_cpu is cleared, the task can be moved to a different CPU.
887 * We must ensure this doesn't happen until the switch is completely
888 * finished.
889 */
890 smp_wmb();
891 prev->on_cpu = 0;
892#endif
893#ifdef CONFIG_DEBUG_SPINLOCK
894 /* this is a valid case when another task releases the spinlock */
895 rq->lock.owner = current;
896#endif
897 /*
898 * If we are tracking spinlock dependencies then we have to
899 * fix up the runqueue lock - which gets 'carried over' from
900 * prev into current:
901 */
902 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
903
904 raw_spin_unlock_irq(&rq->lock);
905}
906
907#else /* __ARCH_WANT_UNLOCKED_CTXSW */
908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
909{
910#ifdef CONFIG_SMP
911 /*
912 * We can optimise this out completely for !SMP, because the
913 * SMP rebalancing from interrupt is the only thing that cares
914 * here.
915 */
916 next->on_cpu = 1;
917#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 raw_spin_unlock_irq(&rq->lock);
920#else
921 raw_spin_unlock(&rq->lock);
922#endif
923}
924
925static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
926{
927#ifdef CONFIG_SMP
928 /*
929 * After ->on_cpu is cleared, the task can be moved to a different CPU.
930 * We must ensure this doesn't happen until the switch is completely
931 * finished.
932 */
933 smp_wmb();
934 prev->on_cpu = 0;
935#endif
936#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
937 local_irq_enable();
938#endif
939}
940#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
941 288
942/* 289/*
943 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1020,20 +367,6 @@ static struct rq *this_rq_lock(void)
1020 * rq->lock. 367 * rq->lock.
1021 */ 368 */
1022 369
1023/*
1024 * Use hrtick when:
1025 * - enabled by features
1026 * - hrtimer is actually high res
1027 */
1028static inline int hrtick_enabled(struct rq *rq)
1029{
1030 if (!sched_feat(HRTICK))
1031 return 0;
1032 if (!cpu_active(cpu_of(rq)))
1033 return 0;
1034 return hrtimer_is_hres_active(&rq->hrtick_timer);
1035}
1036
1037static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1038{ 371{
1039 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1077,7 +410,7 @@ static void __hrtick_start(void *arg)
1077 * 410 *
1078 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1079 */ 412 */
1080static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1081{ 414{
1082 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1083 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1121,7 +454,7 @@ static __init void init_hrtick(void)
1121 * 454 *
1122 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1123 */ 456 */
1124static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1125{ 458{
1126 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1127 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1172,7 +505,7 @@ static inline void init_hrtick(void)
1172#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1173#endif 506#endif
1174 507
1175static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1176{ 509{
1177 int cpu; 510 int cpu;
1178 511
@@ -1193,7 +526,7 @@ static void resched_task(struct task_struct *p)
1193 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1194} 527}
1195 528
1196static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1197{ 530{
1198 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1199 unsigned long flags; 532 unsigned long flags;
@@ -1272,14 +605,22 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 605 smp_send_reschedule(cpu);
1273} 606}
1274 607
1275#endif /* CONFIG_NO_HZ */ 608static inline bool got_nohz_idle_kick(void)
609{
610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
612}
1276 613
1277static u64 sched_avg_period(void) 614#else /* CONFIG_NO_HZ */
615
616static inline bool got_nohz_idle_kick(void)
1278{ 617{
1279 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 618 return false;
1280} 619}
1281 620
1282static void sched_avg_update(struct rq *rq) 621#endif /* CONFIG_NO_HZ */
622
623void sched_avg_update(struct rq *rq)
1283{ 624{
1284 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1285 626
@@ -1295,200 +636,34 @@ static void sched_avg_update(struct rq *rq)
1295 } 636 }
1296} 637}
1297 638
1298static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1299{
1300 rq->rt_avg += rt_delta;
1301 sched_avg_update(rq);
1302}
1303
1304#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1305static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1306{ 641{
1307 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1308 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1309} 644}
1310
1311static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1312{
1313}
1314
1315static void sched_avg_update(struct rq *rq)
1316{
1317}
1318#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1319 646
1320#if BITS_PER_LONG == 32 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1321# define WMULT_CONST (~0UL) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1322#else
1323# define WMULT_CONST (1UL << 32)
1324#endif
1325
1326#define WMULT_SHIFT 32
1327
1328/*
1329 * Shift right and round:
1330 */
1331#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1332
1333/*
1334 * delta *= weight / lw
1335 */
1336static unsigned long
1337calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1338 struct load_weight *lw)
1339{
1340 u64 tmp;
1341
1342 /*
1343 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1344 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1345 * 2^SCHED_LOAD_RESOLUTION.
1346 */
1347 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1348 tmp = (u64)delta_exec * scale_load_down(weight);
1349 else
1350 tmp = (u64)delta_exec;
1351
1352 if (!lw->inv_weight) {
1353 unsigned long w = scale_load_down(lw->weight);
1354
1355 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1356 lw->inv_weight = 1;
1357 else if (unlikely(!w))
1358 lw->inv_weight = WMULT_CONST;
1359 else
1360 lw->inv_weight = WMULT_CONST / w;
1361 }
1362
1363 /*
1364 * Check whether we'd overflow the 64-bit multiplication:
1365 */
1366 if (unlikely(tmp > WMULT_CONST))
1367 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1368 WMULT_SHIFT/2);
1369 else
1370 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1371
1372 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1373}
1374
1375static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1376{
1377 lw->weight += inc;
1378 lw->inv_weight = 0;
1379}
1380
1381static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1382{
1383 lw->weight -= dec;
1384 lw->inv_weight = 0;
1385}
1386
1387static inline void update_load_set(struct load_weight *lw, unsigned long w)
1388{
1389 lw->weight = w;
1390 lw->inv_weight = 0;
1391}
1392
1393/* 649/*
1394 * To aid in avoiding the subversion of "niceness" due to uneven distribution 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1395 * of tasks with abnormal "nice" values across CPUs the contribution that 651 * node and @up when leaving it for the final time.
1396 * each task makes to its run queue's load is weighted according to its
1397 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1398 * scaled version of the new time slice allocation that they receive on time
1399 * slice expiry etc.
1400 */
1401
1402#define WEIGHT_IDLEPRIO 3
1403#define WMULT_IDLEPRIO 1431655765
1404
1405/*
1406 * Nice levels are multiplicative, with a gentle 10% change for every
1407 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1408 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1409 * that remained on nice 0.
1410 * 652 *
1411 * The "10% effect" is relative and cumulative: from _any_ nice level, 653 * Caller must hold rcu_lock or sufficient equivalent.
1412 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1413 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1414 * If a task goes up by ~10% and another task goes down by ~10% then
1415 * the relative distance between them is ~25%.)
1416 */
1417static const int prio_to_weight[40] = {
1418 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1419 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1420 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1421 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1422 /* 0 */ 1024, 820, 655, 526, 423,
1423 /* 5 */ 335, 272, 215, 172, 137,
1424 /* 10 */ 110, 87, 70, 56, 45,
1425 /* 15 */ 36, 29, 23, 18, 15,
1426};
1427
1428/*
1429 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1430 *
1431 * In cases where the weight does not change often, we can use the
1432 * precalculated inverse to speed up arithmetics by turning divisions
1433 * into multiplications:
1434 */
1435static const u32 prio_to_wmult[40] = {
1436 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1437 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1438 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1439 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1440 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1441 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1442 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1443 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1444};
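
The weight tables being removed here encode the "each nice level is worth about 10% CPU" rule, and calc_delta_mine() above caches a 2^32/weight inverse so the hot path multiplies and rounds (the SRR() macro) instead of dividing. A minimal user-space sketch of that arithmetic, using constants copied from the tables; the 6 ms period is only an assumed example, not a kernel value:

    /* hedged user-space sketch, not kernel code */
    #include <stdio.h>
    #include <stdint.h>

    #define WMULT_SHIFT 32
    #define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y)) /* shift right and round */

    int main(void)
    {
            uint64_t w0 = 1024, w1 = 820;   /* prio_to_weight[] for nice 0 and nice 1 */
            uint64_t total = w0 + w1;
            uint64_t period_ns = 6000000;   /* assumed 6 ms scheduling period */

            /* exact: slice = period * w0 / total */
            printf("exact slice : %llu ns\n",
                   (unsigned long long)(period_ns * w0 / total));

            /* fixed point: precompute inv = 2^32 / total, then multiply and round */
            uint64_t inv = (1ULL << WMULT_SHIFT) / total;
            printf("fixed point : %llu ns\n",
                   (unsigned long long)SRR(period_ns * w0 * inv, WMULT_SHIFT));

            /* the ~10%-per-level rule: nice 0 vs nice 1 splits roughly 55.5 : 44.5 */
            printf("nice0 share : %.1f%%\n", 100.0 * w0 / total);
            return 0;
    }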
1445
1446/* Time spent by the tasks of the cpu accounting group executing in ... */
1447enum cpuacct_stat_index {
1448 CPUACCT_STAT_USER, /* ... user mode */
1449 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1450
1451 CPUACCT_STAT_NSTATS,
1452};
1453
1454#ifdef CONFIG_CGROUP_CPUACCT
1455static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1456static void cpuacct_update_stats(struct task_struct *tsk,
1457 enum cpuacct_stat_index idx, cputime_t val);
1458#else
1459static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1460static inline void cpuacct_update_stats(struct task_struct *tsk,
1461 enum cpuacct_stat_index idx, cputime_t val) {}
1462#endif
1463
1464static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1465{
1466 update_load_add(&rq->load, load);
1467}
1468
1469static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1470{
1471 update_load_sub(&rq->load, load);
1472}
1473
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1475typedef int (*tg_visitor)(struct task_group *, void *);
1476
1477/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when
1479 * leaving it for the final time.
1480 */ 654 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 655int walk_tg_tree_from(struct task_group *from,
656 tg_visitor down, tg_visitor up, void *data)
1482{ 657{
1483 struct task_group *parent, *child; 658 struct task_group *parent, *child;
1484 int ret; 659 int ret;
1485 660
1486 rcu_read_lock(); 661 parent = from;
1487 parent = &root_task_group; 662
1488down: 663down:
1489 ret = (*down)(parent, data); 664 ret = (*down)(parent, data);
1490 if (ret) 665 if (ret)
1491 goto out_unlock; 666 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 667 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 668 parent = child;
1494 goto down; 669 goto down;
@@ -1497,273 +672,24 @@ up:
1497 continue; 672 continue;
1498 } 673 }
1499 ret = (*up)(parent, data); 674 ret = (*up)(parent, data);
1500 if (ret) 675 if (ret || parent == from)
1501 goto out_unlock; 676 goto out;
1502 677
1503 child = parent; 678 child = parent;
1504 parent = parent->parent; 679 parent = parent->parent;
1505 if (parent) 680 if (parent)
1506 goto up; 681 goto up;
1507out_unlock: 682out:
1508 rcu_read_unlock();
1509
1510 return ret; 683 return ret;
1511} 684}
1512 685
1513static int tg_nop(struct task_group *tg, void *data) 686int tg_nop(struct task_group *tg, void *data)
1514{ 687{
1515 return 0; 688 return 0;
1516} 689}
1517#endif 690#endif
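
The rewritten walker above keeps an explicit parent/child cursor and goto labels so it can traverse the task_group tree without recursion, stops as soon as either callback returns non-zero, and never calls @up above the @from root. As a rough mental model only, a recursive user-space equivalent over a toy tree (the node type and callbacks are invented; the real code iterates an RCU-protected list and its caller must hold rcu_read_lock() or equivalent):

    struct node {
            struct node *child[4];
            int nr_children;
    };

    typedef int (*visit_fn)(struct node *n, void *data);

    /* recursive equivalent of walk_tg_tree_from(): @down pre-order, @up post-order */
    static int walk_from(struct node *from, visit_fn down, visit_fn up, void *data)
    {
            int ret = down(from, data);             /* @down on first entry */

            if (ret)
                    return ret;                     /* early termination */

            for (int i = 0; i < from->nr_children; i++) {
                    ret = walk_from(from->child[i], down, up, data);
                    if (ret)
                            return ret;
            }

            return up(from, data);                  /* @up when leaving for the last time */
    }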
1518 691
1519#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1520/* Used instead of source_load when we know the type == 0 */
1521static unsigned long weighted_cpuload(const int cpu)
1522{
1523 return cpu_rq(cpu)->load.weight;
1524}
1525
1526/*
1527 * Return a low guess at the load of a migration-source cpu weighted
1528 * according to the scheduling class and "nice" value.
1529 *
1530 * We want to under-estimate the load of migration sources, to
1531 * balance conservatively.
1532 */
1533static unsigned long source_load(int cpu, int type)
1534{
1535 struct rq *rq = cpu_rq(cpu);
1536 unsigned long total = weighted_cpuload(cpu);
1537
1538 if (type == 0 || !sched_feat(LB_BIAS))
1539 return total;
1540
1541 return min(rq->cpu_load[type-1], total);
1542}
1543
1544/*
1545 * Return a high guess at the load of a migration-target cpu weighted
1546 * according to the scheduling class and "nice" value.
1547 */
1548static unsigned long target_load(int cpu, int type)
1549{
1550 struct rq *rq = cpu_rq(cpu);
1551 unsigned long total = weighted_cpuload(cpu);
1552
1553 if (type == 0 || !sched_feat(LB_BIAS))
1554 return total;
1555
1556 return max(rq->cpu_load[type-1], total);
1557}
1558
1559static unsigned long power_of(int cpu)
1560{
1561 return cpu_rq(cpu)->cpu_power;
1562}
1563
1564static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1565
1566static unsigned long cpu_avg_load_per_task(int cpu)
1567{
1568 struct rq *rq = cpu_rq(cpu);
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570
1571 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575
1576 return rq->avg_load_per_task;
1577}
1578
1579#ifdef CONFIG_PREEMPT
1580
1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1582
1583/*
1584 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1585 * way at the expense of forcing extra atomic operations in all
1586 * invocations. This assures that the double_lock is acquired using the
1587 * same underlying policy as the spinlock_t on this architecture, which
1588 * reduces latency compared to the unfair variant below. However, it
1589 * also adds more overhead and therefore may reduce throughput.
1590 */
1591static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1592 __releases(this_rq->lock)
1593 __acquires(busiest->lock)
1594 __acquires(this_rq->lock)
1595{
1596 raw_spin_unlock(&this_rq->lock);
1597 double_rq_lock(this_rq, busiest);
1598
1599 return 1;
1600}
1601
1602#else
1603/*
1604 * Unfair double_lock_balance: Optimizes throughput at the expense of
1605 * latency by eliminating extra atomic operations when the locks are
1606 * already in proper order on entry. This favors lower cpu-ids and will
1607 * grant the double lock to lower cpus over higher ids under contention,
1608 * regardless of entry order into the function.
1609 */
1610static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1611 __releases(this_rq->lock)
1612 __acquires(busiest->lock)
1613 __acquires(this_rq->lock)
1614{
1615 int ret = 0;
1616
1617 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1618 if (busiest < this_rq) {
1619 raw_spin_unlock(&this_rq->lock);
1620 raw_spin_lock(&busiest->lock);
1621 raw_spin_lock_nested(&this_rq->lock,
1622 SINGLE_DEPTH_NESTING);
1623 ret = 1;
1624 } else
1625 raw_spin_lock_nested(&busiest->lock,
1626 SINGLE_DEPTH_NESTING);
1627 }
1628 return ret;
1629}
1630
1631#endif /* CONFIG_PREEMPT */
1632
1633/*
1634 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1635 */
1636static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637{
1638 if (unlikely(!irqs_disabled())) {
1639  /* printk() doesn't work well under rq->lock */
1640 raw_spin_unlock(&this_rq->lock);
1641 BUG_ON(1);
1642 }
1643
1644 return _double_lock_balance(this_rq, busiest);
1645}
1646
1647static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1648 __releases(busiest->lock)
1649{
1650 raw_spin_unlock(&busiest->lock);
1651 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1652}
1653
1654/*
1655 * double_rq_lock - safely lock two runqueues
1656 *
1657 * Note this does not disable interrupts like task_rq_lock,
1658 * you need to do so manually before calling.
1659 */
1660static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1661 __acquires(rq1->lock)
1662 __acquires(rq2->lock)
1663{
1664 BUG_ON(!irqs_disabled());
1665 if (rq1 == rq2) {
1666 raw_spin_lock(&rq1->lock);
1667 __acquire(rq2->lock); /* Fake it out ;) */
1668 } else {
1669 if (rq1 < rq2) {
1670 raw_spin_lock(&rq1->lock);
1671 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1672 } else {
1673 raw_spin_lock(&rq2->lock);
1674 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1675 }
1676 }
1677}
1678
1679/*
1680 * double_rq_unlock - safely unlock two runqueues
1681 *
1682 * Note this does not restore interrupts like task_rq_unlock,
1683 * you need to do so manually after calling.
1684 */
1685static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __releases(rq1->lock)
1687 __releases(rq2->lock)
1688{
1689 raw_spin_unlock(&rq1->lock);
1690 if (rq1 != rq2)
1691 raw_spin_unlock(&rq2->lock);
1692 else
1693 __release(rq2->lock);
1694}
1695
1696#else /* CONFIG_SMP */
1697
1698/*
1699 * double_rq_lock - safely lock two runqueues
1700 *
1701 * Note this does not disable interrupts like task_rq_lock,
1702 * you need to do so manually before calling.
1703 */
1704static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1705 __acquires(rq1->lock)
1706 __acquires(rq2->lock)
1707{
1708 BUG_ON(!irqs_disabled());
1709 BUG_ON(rq1 != rq2);
1710 raw_spin_lock(&rq1->lock);
1711 __acquire(rq2->lock); /* Fake it out ;) */
1712}
1713
1714/*
1715 * double_rq_unlock - safely unlock two runqueues
1716 *
1717 * Note this does not restore interrupts like task_rq_unlock,
1718 * you need to do so manually after calling.
1719 */
1720static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1721 __releases(rq1->lock)
1722 __releases(rq2->lock)
1723{
1724 BUG_ON(rq1 != rq2);
1725 raw_spin_unlock(&rq1->lock);
1726 __release(rq2->lock);
1727}
1728
1729#endif
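
The balance-locking helpers deleted here (they live on in the new kernel/sched/ files rather than going away) all rely on one invariant: when two runqueue locks must be held, they are taken in ascending address order, and double_lock_balance() may drop this_rq->lock so both can be re-acquired in that order without an ABBA deadlock. A minimal user-space sketch of the ordering rule, with pthread mutexes standing in for raw_spinlock_t:

    #include <pthread.h>

    /* hedged sketch, not kernel code: lock two queues in address order */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {                   /* same queue: take it once */
                    pthread_mutex_lock(a);
                    return;
            }
            if (a > b) {                    /* normalize to address order */
                    pthread_mutex_t *tmp = a;
                    a = b;
                    b = tmp;
            }
            pthread_mutex_lock(a);          /* lower address first ... */
            pthread_mutex_lock(b);          /* ... then the higher one */
    }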
1730
1731static void calc_load_account_idle(struct rq *this_rq);
1732static void update_sysctl(void);
1733static int get_update_sysctl_factor(void);
1734static void update_cpu_load(struct rq *this_rq);
1735
1736static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1737{
1738 set_task_rq(p, cpu);
1739#ifdef CONFIG_SMP
1740 /*
1741 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1742  * successfully executed on another CPU. We must ensure that updates of
1743 * per-task data have been completed by this moment.
1744 */
1745 smp_wmb();
1746 task_thread_info(p)->cpu = cpu;
1747#endif
1748}
1749
1750static const struct sched_class rt_sched_class;
1751
1752#define sched_class_highest (&stop_sched_class)
1753#define for_each_class(class) \
1754 for (class = sched_class_highest; class; class = class->next)
1755
1756#include "sched_stats.h"
1757
1758static void inc_nr_running(struct rq *rq)
1759{
1760 rq->nr_running++;
1761}
1762
1763static void dec_nr_running(struct rq *rq)
1764{
1765 rq->nr_running--;
1766}
1767 693
1768static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1769{ 695{
@@ -1800,25 +726,23 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1800/* 726/*
1801 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1802 */ 728 */
1803static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1804{ 730{
1805 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1806 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
1807 733
1808 enqueue_task(rq, p, flags); 734 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 735}
1811 736
1812/* 737/*
1813 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1814 */ 739 */
1815static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1816{ 741{
1817 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1818 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
1819 744
1820 dequeue_task(rq, p, flags); 745 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 746}
1823 747
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 748#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2004,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2005static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2006{ 930{
2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2008 unsigned long flags; 932 unsigned long flags;
2009 u64 latest_ns; 933 u64 latest_ns;
2010 int ret = 0; 934 int ret = 0;
2011 935
2012 local_irq_save(flags); 936 local_irq_save(flags);
2013 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2014 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2015 ret = 1; 939 ret = 1;
2016 local_irq_restore(flags); 940 local_irq_restore(flags);
2017 return ret; 941 return ret;
@@ -2019,14 +943,14 @@ static int irqtime_account_hi_update(void)
2019 943
2020static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2021{ 945{
2022 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2023 unsigned long flags; 947 unsigned long flags;
2024 u64 latest_ns; 948 u64 latest_ns;
2025 int ret = 0; 949 int ret = 0;
2026 950
2027 local_irq_save(flags); 951 local_irq_save(flags);
2028 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2029 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2030 ret = 1; 954 ret = 1;
2031 local_irq_restore(flags); 955 local_irq_restore(flags);
2032 return ret; 956 return ret;
@@ -2038,15 +962,6 @@ static int irqtime_account_si_update(void)
2038 962
2039#endif 963#endif
2040 964
2041#include "sched_idletask.c"
2042#include "sched_fair.c"
2043#include "sched_rt.c"
2044#include "sched_autogroup.c"
2045#include "sched_stoptask.c"
2046#ifdef CONFIG_SCHED_DEBUG
2047# include "sched_debug.c"
2048#endif
2049
2050void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2051{ 966{
2052 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2144,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2144 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2145} 1060}
2146 1061
2147static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2148{ 1063{
2149 const struct sched_class *class; 1064 const struct sched_class *class;
2150 1065
@@ -2170,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2170} 1085}
2171 1086
2172#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2173/*
2174 * Is this task likely cache-hot:
2175 */
2176static int
2177task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2178{
2179 s64 delta;
2180
2181 if (p->sched_class != &fair_sched_class)
2182 return 0;
2183
2184 if (unlikely(p->policy == SCHED_IDLE))
2185 return 0;
2186
2187 /*
2188 * Buddy candidates are cache hot:
2189 */
2190 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2191 (&p->se == cfs_rq_of(&p->se)->next ||
2192 &p->se == cfs_rq_of(&p->se)->last))
2193 return 1;
2194
2195 if (sysctl_sched_migration_cost == -1)
2196 return 1;
2197 if (sysctl_sched_migration_cost == 0)
2198 return 0;
2199
2200 delta = now - p->se.exec_start;
2201
2202 return delta < (s64)sysctl_sched_migration_cost;
2203}
2204
2205void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2206{ 1089{
2207#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2390,11 +1273,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 1273
2391 /* Look for allowed, online CPU in same node. */ 1274 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1275 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1276 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 1277 return dest_cpu;
2395 1278
2396 /* Any allowed, online CPU? */ 1279 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 1280 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 1281 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 1282 return dest_cpu;
2400 1283
@@ -2431,7 +1314,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 1314 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 1315 * not worry about this generic constraint ]
2433 */ 1316 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 1317 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 1318 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 1319 cpu = select_fallback_rq(task_cpu(p), p);
2437 1320
@@ -2556,42 +1439,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 1439}
2557 1440
2558#ifdef CONFIG_SMP 1441#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 1442static void sched_ttwu_pending(void)
2560{ 1443{
2561 struct rq *rq = this_rq(); 1444 struct rq *rq = this_rq();
1445 struct llist_node *llist = llist_del_all(&rq->wake_list);
1446 struct task_struct *p;
2562 1447
2563 raw_spin_lock(&rq->lock); 1448 raw_spin_lock(&rq->lock);
2564 1449
2565 while (list) { 1450 while (llist) {
2566 struct task_struct *p = list; 1451 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 1452 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 1453 ttwu_do_activate(rq, p, 0);
2569 } 1454 }
2570 1455
2571 raw_spin_unlock(&rq->lock); 1456 raw_spin_unlock(&rq->lock);
2572} 1457}
2573 1458
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 1459void scheduler_ipi(void)
2590{ 1460{
2591 struct rq *rq = this_rq(); 1461 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 1462 return;
2596 1463
2597 /* 1464 /*
@@ -2608,25 +1475,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 1475 * somewhat pessimize the simple resched case.
2609 */ 1476 */
2610 irq_enter(); 1477 irq_enter();
2611 sched_ttwu_do_pending(list); 1478 sched_ttwu_pending();
1479
1480 /*
1481 * Check if someone kicked us for doing the nohz idle load balance.
1482 */
1483 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1484 this_rq()->idle_balance = 1;
1485 raise_softirq_irqoff(SCHED_SOFTIRQ);
1486 }
2612 irq_exit(); 1487 irq_exit();
2613} 1488}
2614 1489
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 1490static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 1491{
2617 struct rq *rq = cpu_rq(cpu); 1492 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 1493 smp_send_reschedule(cpu);
2631} 1494}
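
The hand-rolled cmpxchg wake list is replaced by the generic llist primitives: llist_add() reports whether the list was empty, so only the first waker sends the reschedule IPI, and sched_ttwu_pending() later detaches the whole batch with llist_del_all(). A rough user-space rendering of that pattern using C11 atomics (the types and names are invented for illustration, not the kernel llist API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct wake_node {
            struct wake_node *next;
    };

    /* returns true if the list was empty, i.e. this caller should send the IPI */
    static bool wake_list_add(_Atomic(struct wake_node *) *head, struct wake_node *n)
    {
            struct wake_node *first = atomic_load(head);

            do {
                    n->next = first;
            } while (!atomic_compare_exchange_weak(head, &first, n));

            return first == NULL;
    }

    /* consumer side: detach everything queued so far in one shot */
    static struct wake_node *wake_list_del_all(_Atomic(struct wake_node *) *head)
    {
            return atomic_exchange(head, NULL);
    }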
2632 1495
@@ -2648,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2648 1511
2649} 1512}
2650#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2651#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2652 1520
2653static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2655,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2655 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2656 1524
2657#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2658 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2659 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2660 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2661 return; 1529 return;
@@ -2848,19 +1716,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 1716 p->state = TASK_RUNNING;
2849 1717
2850 /* 1718 /*
1719 * Make sure we do not leak PI boosting priority to the child.
1720 */
1721 p->prio = current->normal_prio;
1722
1723 /*
2851 * Revert to default priority/policy on fork if requested. 1724 * Revert to default priority/policy on fork if requested.
2852 */ 1725 */
2853 if (unlikely(p->sched_reset_on_fork)) { 1726 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 1727 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 1728 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 1729 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 1730 p->rt_priority = 0;
2862 set_load_weight(p); 1731 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 1732 p->static_prio = NICE_TO_PRIO(0);
1733
1734 p->prio = p->normal_prio = __normal_prio(p);
1735 set_load_weight(p);
2864 1736
2865 /* 1737 /*
2866 * We don't need the reset flag anymore after the fork. It has 1738 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +1741,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 1741 p->sched_reset_on_fork = 0;
2870 } 1742 }
2871 1743
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 1744 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 1745 p->sched_class = &fair_sched_class;
2879 1746
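
The reordered fork path above first strips any PI-boosted priority from the child and then, for tasks flagged with sched_reset_on_fork, drops RT policy and negative nice before recomputing p->prio. From userspace the flag is requested by OR-ing SCHED_RESET_ON_FORK into the policy passed to sched_setscheduler(); a small hedged example (it only affects children forked afterwards, and setting SCHED_FIFO needs RT privileges):

    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK 0x40000000  /* from linux/sched.h if libc lacks it */
    #endif

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            /* run as SCHED_FIFO ourselves, but have children revert to SCHED_NORMAL */
            if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                    perror("sched_setscheduler");

            return 0;
    }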
@@ -3070,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3070 local_irq_enable(); 1937 local_irq_enable();
3071#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3072 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3073 1941
3074 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3075 if (mm) 1943 if (mm)
@@ -3305,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3305 */ 2173 */
3306static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3307 2175
3308static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3309{ 2177{
3310 long delta; 2178 long delta;
3311 2179
@@ -3449,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3449 */ 2317 */
3450} 2318}
3451#else 2319#else
3452static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3453{ 2321{
3454} 2322}
3455 2323
@@ -3592,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3592 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3593 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3594 */ 2462 */
3595static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3596{ 2464{
3597 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3598 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3670,8 +2538,10 @@ unlock:
3670#endif 2538#endif
3671 2539
3672DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3673 2542
3674EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3675 2545
3676/* 2546/*
3677 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3724,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3724 return ns; 2594 return ns;
3725} 2595}
3726 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since all updates are sure to touch the root cgroup, we
2611 * get ourselves ahead and touch it first. If the root cgroup
2612 * is the only cgroup, then nothing else should be necessary.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
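
task_group_account_field() folds what used to be a cpustat update plus a separate cpuacct_update_stats() call into one helper: the per-CPU kernel_cpustat (the root view) is charged unconditionally, and only then is the cpuacct ancestry walked under RCU, stopping at the root group. The shape of that pattern, as a hedged stand-alone sketch with invented types:

    struct acct {
            struct acct *parent;
            unsigned long long stat[8];
    };

    static struct acct root_acct;
    static unsigned long long global_stat[8];   /* stand-in for kernel_cpustat */

    static void account_field(struct acct *ca, int index, unsigned long long val)
    {
            global_stat[index] += val;              /* root bucket, always charged */

            for (; ca && ca != &root_acct; ca = ca->parent)
                    ca->stat[index] += val;         /* every non-root ancestor */
    }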
2631
2632
3727/* 2633/*
3728 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3729 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3733,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3733void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3734 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3735{ 2641{
3736 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3737 cputime64_t tmp;
3738 2643
3739 /* Add user time to process. */ 2644 /* Add user time to process. */
3740 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3741 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3742 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3743 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3744 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3745 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3746 if (TASK_NICE(p) > 0)
3747 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3748 else
3749 cpustat->user = cputime64_add(cpustat->user, tmp);
3750 2653
3751 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3752 /* Account for user time used */ 2654 /* Account for user time used */
3753 acct_update_integrals(p); 2655 acct_update_integrals(p);
3754} 2656}
@@ -3762,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3762static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3763 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3764{ 2666{
3765 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3766 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3767
3768 tmp = cputime_to_cputime64(cputime);
3769 2668
3770 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3771 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3772 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3773 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3774 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3775 2674
3776 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3777 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3778 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3779 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3780 } else { 2679 } else {
3781 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3782 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3783 } 2682 }
3784} 2683}
3785 2684
@@ -3792,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3792 */ 2691 */
3793static inline 2692static inline
3794void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3795 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3796{ 2695{
3797 cputime64_t tmp = cputime_to_cputime64(cputime);
3798
3799 /* Add system time to process. */ 2696 /* Add system time to process. */
3800 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3801 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3802 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3803 2700
3804 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3805 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3806 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3807 2703
3808 /* Account for system time used */ 2704 /* Account for system time used */
3809 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3819,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3819void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3820 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3821{ 2717{
3822 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3823 cputime64_t *target_cputime64;
3824 2719
3825 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3826 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3828,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3828 } 2723 }
3829 2724
3830 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3831 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3832 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3833 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3834 else 2729 else
3835 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3836 2731
3837 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3838} 2733}
3839 2734
3840/* 2735/*
@@ -3843,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3843 */ 2738 */
3844void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3845{ 2740{
3846 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3847 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3848 2742
3849 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3850} 2744}
3851 2745
3852/* 2746/*
@@ -3855,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3855 */ 2749 */
3856void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3857{ 2751{
3858 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3859 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3860 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3861 2754
3862 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3863 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3864 else 2757 else
3865 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
3866} 2759}
3867 2760
3868static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -3912,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3912 struct rq *rq) 2805 struct rq *rq)
3913{ 2806{
3914 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3915 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
3916 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3917 2809
3918 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
3919 return; 2811 return;
3920 2812
3921 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
3922 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3923 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
3924 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3925 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
3926 /* 2818 /*
3927 * ksoftirqd time do not get accounted in cpu_softirq_time. 2819 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -3929,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3929 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
3930 */ 2822 */
3931 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3932 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
3933 } else if (user_tick) { 2825 } else if (user_tick) {
3934 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3935 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -3938,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3938 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3939 } else { 2831 } else {
3940 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3941 &cpustat->system); 2833 CPUTIME_SYSTEM);
3942 } 2834 }
3943} 2835}
3944 2836
@@ -4037,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4037 2929
4038void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4039{ 2931{
4040 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4041 2933
4042 /* 2934 /*
4043 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4045,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4045 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4046 2938
4047 if (total) { 2939 if (total) {
4048 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4049 2941
4050 temp *= utime; 2942 temp *= (__force u64) utime;
4051 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4052 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4053 } else 2945 } else
4054 utime = rtime; 2946 utime = rtime;
4055 2947
@@ -4057,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4057 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4058 */ 2950 */
4059 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4060 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4061 2953
4062 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4063 *st = p->prev_stime; 2955 *st = p->prev_stime;
@@ -4074,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4074 2966
4075 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4076 2968
4077 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4078 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4079 2971
4080 if (total) { 2972 if (total) {
4081 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4082 2974
4083 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4084 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4085 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4086 } else 2978 } else
4087 utime = rtime; 2979 utime = rtime;
4088 2980
4089 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4090 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4091 cputime_sub(rtime, sig->prev_utime));
4092 2983
4093 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4094 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4116,7 +3007,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 3007 perf_event_task_tick();
4117 3008
4118#ifdef CONFIG_SMP 3009#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 3010 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 3011 trigger_load_balance(rq, cpu);
4121#endif 3012#endif
4122} 3013}
@@ -4187,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4187{ 3078{
4188 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4189 3080
3081 if (oops_in_progress)
3082 return;
3083
4190 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4191 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4192 3086
@@ -4213,6 +3107,7 @@ static inline void schedule_debug(struct task_struct *prev)
4213 */ 3107 */
4214 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3108 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4215 __schedule_bug(prev); 3109 __schedule_bug(prev);
3110 rcu_sleep_check();
4216 3111
4217 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3112 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4218 3113
@@ -4239,7 +3134,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 3134 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 3135 * the fair class we can call that function directly:
4241 */ 3136 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3137 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 3138 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 3139 if (likely(p))
4245 return p; 3140 return p;
@@ -4676,6 +3571,9 @@ EXPORT_SYMBOL(wait_for_completion);
4676 * This waits for either a completion of a specific task to be signaled or for a 3571 * This waits for either a completion of a specific task to be signaled or for a
4677 * specified timeout to expire. The timeout is in jiffies. It is not 3572 * specified timeout to expire. The timeout is in jiffies. It is not
4678 * interruptible. 3573 * interruptible.
3574 *
3575 * The return value is 0 if timed out, and positive (at least 1, or number of
3576 * jiffies left till timeout) if completed.
4679 */ 3577 */
4680unsigned long __sched 3578unsigned long __sched
4681wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3579wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4690,6 +3588,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4690 * 3588 *
4691 * This waits for completion of a specific task to be signaled. It is 3589 * This waits for completion of a specific task to be signaled. It is
4692 * interruptible. 3590 * interruptible.
3591 *
3592 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4693 */ 3593 */
4694int __sched wait_for_completion_interruptible(struct completion *x) 3594int __sched wait_for_completion_interruptible(struct completion *x)
4695{ 3595{
@@ -4707,6 +3607,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4707 * 3607 *
4708 * This waits for either a completion of a specific task to be signaled or for a 3608 * This waits for either a completion of a specific task to be signaled or for a
4709 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 3609 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3610 *
3611 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3612 * positive (at least 1, or number of jiffies left till timeout) if completed.
4710 */ 3613 */
4711long __sched 3614long __sched
4712wait_for_completion_interruptible_timeout(struct completion *x, 3615wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4722,6 +3625,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4722 * 3625 *
4723 * This waits to be signaled for completion of a specific task. It can be 3626 * This waits to be signaled for completion of a specific task. It can be
4724 * interrupted by a kill signal. 3627 * interrupted by a kill signal.
3628 *
3629 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4725 */ 3630 */
4726int __sched wait_for_completion_killable(struct completion *x) 3631int __sched wait_for_completion_killable(struct completion *x)
4727{ 3632{
@@ -4740,6 +3645,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4740 * This waits for either a completion of a specific task to be 3645 * This waits for either a completion of a specific task to be
4741 * signaled or for a specified timeout to expire. It can be 3646 * signaled or for a specified timeout to expire. It can be
4742 * interrupted by a kill signal. The timeout is in jiffies. 3647 * interrupted by a kill signal. The timeout is in jiffies.
3648 *
3649 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3650 * positive (at least 1, or number of jiffies left till timeout) if completed.
4743 */ 3651 */
4744long __sched 3652long __sched
4745wait_for_completion_killable_timeout(struct completion *x, 3653wait_for_completion_killable_timeout(struct completion *x,
@@ -5025,7 +3933,20 @@ EXPORT_SYMBOL(task_nice);
5025 */ 3933 */
5026int idle_cpu(int cpu) 3934int idle_cpu(int cpu)
5027{ 3935{
5028 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3936 struct rq *rq = cpu_rq(cpu);
3937
3938 if (rq->curr != rq->idle)
3939 return 0;
3940
3941 if (rq->nr_running)
3942 return 0;
3943
3944#ifdef CONFIG_SMP
3945 if (!llist_empty(&rq->wake_list))
3946 return 0;
3947#endif
3948
3949 return 1;
5029} 3950}
5030 3951
5031/** 3952/**
@@ -5691,6 +4612,13 @@ again:
5691 */ 4612 */
5692 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5693 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5694 } 4622 }
5695 4623
5696out: 4624out:
@@ -5858,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
5858 free = stack_not_used(p); 4786 free = stack_not_used(p);
5859#endif 4787#endif
5860 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5861 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
5862 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
5863 4791
5864 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -5875,7 +4803,7 @@ void show_state_filter(unsigned long state_filter)
5875 printk(KERN_INFO 4803 printk(KERN_INFO
5876 " task PC stack pid father\n"); 4804 " task PC stack pid father\n");
5877#endif 4805#endif
5878 read_lock(&tasklist_lock); 4806 rcu_read_lock();
5879 do_each_thread(g, p) { 4807 do_each_thread(g, p) {
5880 /* 4808 /*
5881 * reset the NMI-timeout, listing all files on a slow 4809 * reset the NMI-timeout, listing all files on a slow
@@ -5891,7 +4819,7 @@ void show_state_filter(unsigned long state_filter)
5891#ifdef CONFIG_SCHED_DEBUG 4819#ifdef CONFIG_SCHED_DEBUG
5892 sysrq_sched_debug_show(); 4820 sysrq_sched_debug_show();
5893#endif 4821#endif
5894 read_unlock(&tasklist_lock); 4822 rcu_read_unlock();
5895 /* 4823 /*
5896 * Only show locks if all tasks are dumped: 4824 * Only show locks if all tasks are dumped:
5897 */ 4825 */
@@ -5952,62 +4880,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5952 */ 4880 */
5953 idle->sched_class = &idle_sched_class; 4881 idle->sched_class = &idle_sched_class;
5954 ftrace_graph_init_idle_task(idle, cpu); 4882 ftrace_graph_init_idle_task(idle, cpu);
5955} 4883#if defined(CONFIG_SMP)
5956 4884 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5957/* 4885#endif
5958 * In a system that switches off the HZ timer nohz_cpu_mask
5959 * indicates which cpus entered this state. This is used
5960 * in the rcu update to wait only for active cpus. For system
5961 * which do not switch off the HZ timer nohz_cpu_mask should
5962 * always be CPU_BITS_NONE.
5963 */
5964cpumask_var_t nohz_cpu_mask;
5965
5966/*
5967 * Increase the granularity value when there are more CPUs,
5968 * because with more CPUs the 'effective latency' as visible
5969 * to users decreases. But the relationship is not linear,
5970 * so pick a second-best guess by going with the log2 of the
5971 * number of CPUs.
5972 *
5973 * This idea comes from the SD scheduler of Con Kolivas:
5974 */
5975static int get_update_sysctl_factor(void)
5976{
5977 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5978 unsigned int factor;
5979
5980 switch (sysctl_sched_tunable_scaling) {
5981 case SCHED_TUNABLESCALING_NONE:
5982 factor = 1;
5983 break;
5984 case SCHED_TUNABLESCALING_LINEAR:
5985 factor = cpus;
5986 break;
5987 case SCHED_TUNABLESCALING_LOG:
5988 default:
5989 factor = 1 + ilog2(cpus);
5990 break;
5991 }
5992
5993 return factor;
5994}
5995
5996static void update_sysctl(void)
5997{
5998 unsigned int factor = get_update_sysctl_factor();
5999
6000#define SET_SYSCTL(name) \
6001 (sysctl_##name = (factor) * normalized_sysctl_##name)
6002 SET_SYSCTL(sched_min_granularity);
6003 SET_SYSCTL(sched_latency);
6004 SET_SYSCTL(sched_wakeup_granularity);
6005#undef SET_SYSCTL
6006}
6007
6008static inline void sched_init_granularity(void)
6009{
6010 update_sysctl();
6011} 4886}
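
The sysctl-scaling block deleted here (the logic survives in the new kernel/sched/ files rather than going away) multiplies sched_min_granularity, sched_latency and sched_wakeup_granularity by a CPU-count factor; in the default logarithmic mode that factor is 1 + ilog2(min(online cpus, 8)). A small hedged sketch of the computation:

    #include <stdio.h>

    /* hedged user-space sketch of the removed get_update_sysctl_factor() */
    static unsigned int factor_log(unsigned int ncpus)
    {
            unsigned int cpus = ncpus < 8 ? ncpus : 8;
            unsigned int log2 = 0;

            while (cpus >>= 1)
                    log2++;

            return 1 + log2;
    }

    int main(void)
    {
            /* 1 cpu -> 1, 2 cpus -> 2, 4 cpus -> 3, 8 or more -> 4 */
            for (unsigned int n = 1; n <= 16; n *= 2)
                    printf("%2u cpus -> factor %u\n", n, factor_log(n));
            return 0;
    }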
6012 4887
6013#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
@@ -6015,10 +4890,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6015{ 4890{
6016 if (p->sched_class && p->sched_class->set_cpus_allowed) 4891 if (p->sched_class && p->sched_class->set_cpus_allowed)
6017 p->sched_class->set_cpus_allowed(p, new_mask); 4892 p->sched_class->set_cpus_allowed(p, new_mask);
6018 else { 4893
6019 cpumask_copy(&p->cpus_allowed, new_mask); 4894 cpumask_copy(&p->cpus_allowed, new_mask);
6020 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 4895 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6021 }
6022} 4896}
6023 4897
6024/* 4898/*
@@ -6116,7 +4990,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6116 if (task_cpu(p) != src_cpu) 4990 if (task_cpu(p) != src_cpu)
6117 goto done; 4991 goto done;
6118 /* Affinity changed (again). */ 4992 /* Affinity changed (again). */
6119 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 4993 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6120 goto fail; 4994 goto fail;
6121 4995
6122 /* 4996 /*
@@ -6222,6 +5096,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6222 */ 5096 */
6223 rq->stop = NULL; 5097 rq->stop = NULL;
6224 5098
5099 /* Ensure any throttled groups are reachable by pick_next_task */
5100 unthrottle_offline_cfs_rqs(rq);
5101
6225 for ( ; ; ) { 5102 for ( ; ; ) {
6226 /* 5103 /*
6227 * There's this thread running, bail when that's the only 5104 * There's this thread running, bail when that's the only
@@ -6299,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
6299static void 5176static void
6300set_table_entry(struct ctl_table *entry, 5177set_table_entry(struct ctl_table *entry,
6301 const char *procname, void *data, int maxlen, 5178 const char *procname, void *data, int maxlen,
6302 mode_t mode, proc_handler *proc_handler) 5179 umode_t mode, proc_handler *proc_handler)
6303{ 5180{
6304 entry->procname = procname; 5181 entry->procname = procname;
6305 entry->data = data; 5182 entry->data = data;
@@ -6799,6 +5676,12 @@ out:
6799 return -ENOMEM; 5676 return -ENOMEM;
6800} 5677}
6801 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6802static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6803{ 5686{
6804 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -6870,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6870} 5753}
6871 5754
6872/* 5755/*
 5756 * Keep a special pointer to the highest sched_domain that has
 5757 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
 5758 * allows us to avoid some pointer chasing in select_idle_sibling().
 5759 *
 5760 * Also keep a unique ID per domain (we use the first cpu number in
 5761 * the cpumask of the domain); this allows us to quickly tell if
 5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
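
update_top_cache_domain() caches, per CPU, both a pointer to the highest domain sharing a last-level cache and an integer id (the first CPU in that domain's span); ttwu_share_cache() then answers "do these two CPUs share an LLC?" by comparing two per-CPU integers instead of chasing sched_domain parents on every wakeup. A deliberately tiny sketch of the consumer side, with invented storage in place of the per-CPU variables:

    #define NR_FAKE_CPUS 8

    /* stand-in for per_cpu(sd_llc_id, cpu); filled in at domain-build time */
    static int llc_id[NR_FAKE_CPUS];

    static int share_cache(int this_cpu, int that_cpu)
    {
            return llc_id[this_cpu] == llc_id[that_cpu];
    }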
5779
5780/*
6873 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6874 * hold the hotplug lock. 5782 * hold the hotplug lock.
6875 */ 5783 */
@@ -6908,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6908 tmp = rq->sd; 5816 tmp = rq->sd;
6909 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
6910 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
6911} 5821}
6912 5822
6913/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -6923,8 +5833,6 @@ static int __init isolated_cpu_setup(char *str)
6923 5833
6924__setup("isolcpus=", isolated_cpu_setup); 5834__setup("isolcpus=", isolated_cpu_setup);
6925 5835
6926#define SD_NODES_PER_DOMAIN 16
6927
6928#ifdef CONFIG_NUMA 5836#ifdef CONFIG_NUMA
6929 5837
6930/** 5838/**
@@ -7069,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7069 continue; 5977 continue;
7070 5978
7071 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7072 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7073 5981
7074 if (!sg) 5982 if (!sg)
7075 goto fail; 5983 goto fail;
@@ -7207,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7207 return; 6115 return;
7208 6116
7209 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7210} 6124}
7211 6125
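
The __weak hook above deliberately returns 0*SD_ASYM_PACKING, i.e. asymmetric SMT packing stays off unless an architecture overrides it. A hedged sketch of what such an override could look like; the per-architecture code is not part of this patch and the body below is illustrative only.

/* Illustrative arch override: opt the platform into SMT asym packing. */
int arch_sd_sibling_asym_packing(void)
{
	/*
	 * Returning the flag makes sibling domains prefer loading
	 * lower-numbered SMT threads first.
	 */
	return SD_ASYM_PACKING;
}
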
7212/* 6126/*
@@ -7761,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7761} 6675}
7762 6676
7763#ifdef CONFIG_SCHED_MC 6677#ifdef CONFIG_SCHED_MC
7764static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_show(struct device *dev,
7765 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
7766 char *page) 6680 char *buf)
7767{ 6681{
7768 return sprintf(page, "%u\n", sched_mc_power_savings); 6682 return sprintf(buf, "%u\n", sched_mc_power_savings);
7769} 6683}
7770static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6684static ssize_t sched_mc_power_savings_store(struct device *dev,
7771 struct sysdev_class_attribute *attr, 6685 struct device_attribute *attr,
7772 const char *buf, size_t count) 6686 const char *buf, size_t count)
7773{ 6687{
7774 return sched_power_savings_store(buf, count, 0); 6688 return sched_power_savings_store(buf, count, 0);
7775} 6689}
7776static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6690static DEVICE_ATTR(sched_mc_power_savings, 0644,
7777 sched_mc_power_savings_show, 6691 sched_mc_power_savings_show,
7778 sched_mc_power_savings_store); 6692 sched_mc_power_savings_store);
7779#endif 6693#endif
7780 6694
7781#ifdef CONFIG_SCHED_SMT 6695#ifdef CONFIG_SCHED_SMT
7782static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_show(struct device *dev,
7783 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
7784 char *page) 6698 char *buf)
7785{ 6699{
7786 return sprintf(page, "%u\n", sched_smt_power_savings); 6700 return sprintf(buf, "%u\n", sched_smt_power_savings);
7787} 6701}
7788static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6702static ssize_t sched_smt_power_savings_store(struct device *dev,
7789 struct sysdev_class_attribute *attr, 6703 struct device_attribute *attr,
7790 const char *buf, size_t count) 6704 const char *buf, size_t count)
7791{ 6705{
7792 return sched_power_savings_store(buf, count, 1); 6706 return sched_power_savings_store(buf, count, 1);
7793} 6707}
7794static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6708static DEVICE_ATTR(sched_smt_power_savings, 0644,
7795 sched_smt_power_savings_show, 6709 sched_smt_power_savings_show,
7796 sched_smt_power_savings_store); 6710 sched_smt_power_savings_store);
7797#endif 6711#endif
7798 6712
7799int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6713int __init sched_create_sysfs_power_savings_entries(struct device *dev)
7800{ 6714{
7801 int err = 0; 6715 int err = 0;
7802 6716
7803#ifdef CONFIG_SCHED_SMT 6717#ifdef CONFIG_SCHED_SMT
7804 if (smt_capable()) 6718 if (smt_capable())
7805 err = sysfs_create_file(&cls->kset.kobj, 6719 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
7806 &attr_sched_smt_power_savings.attr);
7807#endif 6720#endif
7808#ifdef CONFIG_SCHED_MC 6721#ifdef CONFIG_SCHED_MC
7809 if (!err && mc_capable()) 6722 if (!err && mc_capable())
7810 err = sysfs_create_file(&cls->kset.kobj, 6723 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
7811 &attr_sched_mc_power_savings.attr);
7812#endif 6724#endif
7813 return err; 6725 return err;
7814} 6726}
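
The hunk above converts the power-savings knobs from sysdev_class attributes to regular struct device attributes. For reference, a minimal sketch of the same DEVICE_ATTR / device_create_file() pattern outside the scheduler; the attribute name demo_knob and the demo_register() wrapper are made up, and the struct device is assumed to come from elsewhere.

#include <linux/device.h>
#include <linux/kernel.h>

static unsigned int demo_knob;

static ssize_t demo_knob_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", demo_knob);
}

static ssize_t demo_knob_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	if (kstrtouint(buf, 10, &demo_knob))
		return -EINVAL;
	return count;
}

static DEVICE_ATTR(demo_knob, 0644, demo_knob_show, demo_knob_store);

/* Registration mirrors sched_create_sysfs_power_savings_entries(). */
static int demo_register(struct device *dev)
{
	return device_create_file(dev, &dev_attr_demo_knob);
}
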
@@ -7844,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7844 } 6756 }
7845} 6757}
7846 6758
7847static int update_runtime(struct notifier_block *nfb,
7848 unsigned long action, void *hcpu)
7849{
7850 int cpu = (int)(long)hcpu;
7851
7852 switch (action) {
7853 case CPU_DOWN_PREPARE:
7854 case CPU_DOWN_PREPARE_FROZEN:
7855 disable_runtime(cpu_rq(cpu));
7856 return NOTIFY_OK;
7857
7858 case CPU_DOWN_FAILED:
7859 case CPU_DOWN_FAILED_FROZEN:
7860 case CPU_ONLINE:
7861 case CPU_ONLINE_FROZEN:
7862 enable_runtime(cpu_rq(cpu));
7863 return NOTIFY_OK;
7864
7865 default:
7866 return NOTIFY_DONE;
7867 }
7868}
7869
7870void __init sched_init_smp(void) 6759void __init sched_init_smp(void)
7871{ 6760{
7872 cpumask_var_t non_isolated_cpus; 6761 cpumask_var_t non_isolated_cpus;
@@ -7915,103 +6804,11 @@ int in_sched_functions(unsigned long addr)
7915 && addr < (unsigned long)__sched_text_end); 6804 && addr < (unsigned long)__sched_text_end);
7916} 6805}
7917 6806
7918static void init_cfs_rq(struct cfs_rq *cfs_rq) 6807#ifdef CONFIG_CGROUP_SCHED
7919{ 6808struct task_group root_task_group;
7920 cfs_rq->tasks_timeline = RB_ROOT;
7921 INIT_LIST_HEAD(&cfs_rq->tasks);
7922 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7923#ifndef CONFIG_64BIT
7924 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7925#endif
7926}
7927
7928static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7929{
7930 struct rt_prio_array *array;
7931 int i;
7932
7933 array = &rt_rq->active;
7934 for (i = 0; i < MAX_RT_PRIO; i++) {
7935 INIT_LIST_HEAD(array->queue + i);
7936 __clear_bit(i, array->bitmap);
7937 }
7938 /* delimiter for bitsearch: */
7939 __set_bit(MAX_RT_PRIO, array->bitmap);
7940
7941#if defined CONFIG_SMP
7942 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7943 rt_rq->highest_prio.next = MAX_RT_PRIO;
7944 rt_rq->rt_nr_migratory = 0;
7945 rt_rq->overloaded = 0;
7946 plist_head_init(&rt_rq->pushable_tasks);
7947#endif
7948
7949 rt_rq->rt_time = 0;
7950 rt_rq->rt_throttled = 0;
7951 rt_rq->rt_runtime = 0;
7952 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7953}
7954
7955#ifdef CONFIG_FAIR_GROUP_SCHED
7956static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7957 struct sched_entity *se, int cpu,
7958 struct sched_entity *parent)
7959{
7960 struct rq *rq = cpu_rq(cpu);
7961
7962 cfs_rq->tg = tg;
7963 cfs_rq->rq = rq;
7964#ifdef CONFIG_SMP
7965 /* allow initial update_cfs_load() to truncate */
7966 cfs_rq->load_stamp = 1;
7967#endif 6809#endif
7968 6810
7969 tg->cfs_rq[cpu] = cfs_rq; 6811DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
7970 tg->se[cpu] = se;
7971
7972 /* se could be NULL for root_task_group */
7973 if (!se)
7974 return;
7975
7976 if (!parent)
7977 se->cfs_rq = &rq->cfs;
7978 else
7979 se->cfs_rq = parent->my_q;
7980
7981 se->my_q = cfs_rq;
7982 update_load_set(&se->load, 0);
7983 se->parent = parent;
7984}
7985#endif
7986
7987#ifdef CONFIG_RT_GROUP_SCHED
7988static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7989 struct sched_rt_entity *rt_se, int cpu,
7990 struct sched_rt_entity *parent)
7991{
7992 struct rq *rq = cpu_rq(cpu);
7993
7994 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7995 rt_rq->rt_nr_boosted = 0;
7996 rt_rq->rq = rq;
7997 rt_rq->tg = tg;
7998
7999 tg->rt_rq[cpu] = rt_rq;
8000 tg->rt_se[cpu] = rt_se;
8001
8002 if (!rt_se)
8003 return;
8004
8005 if (!parent)
8006 rt_se->rt_rq = &rq->rt;
8007 else
8008 rt_se->rt_rq = parent->my_q;
8009
8010 rt_se->my_q = rt_rq;
8011 rt_se->parent = parent;
8012 INIT_LIST_HEAD(&rt_se->run_list);
8013}
8014#endif
8015 6812
8016void __init sched_init(void) 6813void __init sched_init(void)
8017{ 6814{
@@ -8069,9 +6866,17 @@ void __init sched_init(void)
8069#ifdef CONFIG_CGROUP_SCHED 6866#ifdef CONFIG_CGROUP_SCHED
8070 list_add(&root_task_group.list, &task_groups); 6867 list_add(&root_task_group.list, &task_groups);
8071 INIT_LIST_HEAD(&root_task_group.children); 6868 INIT_LIST_HEAD(&root_task_group.children);
6869 INIT_LIST_HEAD(&root_task_group.siblings);
8072 autogroup_init(&init_task); 6870 autogroup_init(&init_task);
6871
8073#endif /* CONFIG_CGROUP_SCHED */ 6872#endif /* CONFIG_CGROUP_SCHED */
8074 6873
6874#ifdef CONFIG_CGROUP_CPUACCT
6875 root_cpuacct.cpustat = &kernel_cpustat;
6876 root_cpuacct.cpuusage = alloc_percpu(u64);
6877 /* Too early, not expected to fail */
6878 BUG_ON(!root_cpuacct.cpuusage);
6879#endif
8075 for_each_possible_cpu(i) { 6880 for_each_possible_cpu(i) {
8076 struct rq *rq; 6881 struct rq *rq;
8077 6882
@@ -8083,7 +6888,7 @@ void __init sched_init(void)
8083 init_cfs_rq(&rq->cfs); 6888 init_cfs_rq(&rq->cfs);
8084 init_rt_rq(&rq->rt, rq); 6889 init_rt_rq(&rq->rt, rq);
8085#ifdef CONFIG_FAIR_GROUP_SCHED 6890#ifdef CONFIG_FAIR_GROUP_SCHED
8086 root_task_group.shares = root_task_group_load; 6891 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8087 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6892 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8088 /* 6893 /*
8089 * How much cpu bandwidth does root_task_group get? 6894 * How much cpu bandwidth does root_task_group get?
@@ -8104,6 +6909,7 @@ void __init sched_init(void)
8104 * We achieve this by letting root_task_group's tasks sit 6909 * We achieve this by letting root_task_group's tasks sit
8105 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6910 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8106 */ 6911 */
6912 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8107 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6913 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8108#endif /* CONFIG_FAIR_GROUP_SCHED */ 6914#endif /* CONFIG_FAIR_GROUP_SCHED */
8109 6915
@@ -8132,8 +6938,7 @@ void __init sched_init(void)
8132 rq->avg_idle = 2*sysctl_sched_migration_cost; 6938 rq->avg_idle = 2*sysctl_sched_migration_cost;
8133 rq_attach_root(rq, &def_root_domain); 6939 rq_attach_root(rq, &def_root_domain);
8134#ifdef CONFIG_NO_HZ 6940#ifdef CONFIG_NO_HZ
8135 rq->nohz_balance_kick = 0; 6941 rq->nohz_flags = 0;
8136 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8137#endif 6942#endif
8138#endif 6943#endif
8139 init_rq_hrtick(rq); 6944 init_rq_hrtick(rq);
@@ -8146,10 +6951,6 @@ void __init sched_init(void)
8146 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6951 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8147#endif 6952#endif
8148 6953
8149#ifdef CONFIG_SMP
8150 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8151#endif
8152
8153#ifdef CONFIG_RT_MUTEXES 6954#ifdef CONFIG_RT_MUTEXES
8154 plist_head_init(&init_task.pi_waiters); 6955 plist_head_init(&init_task.pi_waiters);
8155#endif 6956#endif
@@ -8175,21 +6976,13 @@ void __init sched_init(void)
8175 */ 6976 */
8176 current->sched_class = &fair_sched_class; 6977 current->sched_class = &fair_sched_class;
8177 6978
8178 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8179 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8180#ifdef CONFIG_SMP 6979#ifdef CONFIG_SMP
8181 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6980 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8182#ifdef CONFIG_NO_HZ
8183 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8184 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8185 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8186 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8187 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8188#endif
8189 /* May be allocated at isolcpus cmdline parse time */ 6981 /* May be allocated at isolcpus cmdline parse time */
8190 if (cpu_isolated_map == NULL) 6982 if (cpu_isolated_map == NULL)
8191 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6983 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8192#endif /* SMP */ 6984#endif
6985 init_sched_fair_class();
8193 6986
8194 scheduler_running = 1; 6987 scheduler_running = 1;
8195} 6988}
@@ -8206,6 +6999,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8206{ 6999{
8207 static unsigned long prev_jiffy; /* ratelimiting */ 7000 static unsigned long prev_jiffy; /* ratelimiting */
8208 7001
7002 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
8209 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7003 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8210 system_state != SYSTEM_RUNNING || oops_in_progress) 7004 system_state != SYSTEM_RUNNING || oops_in_progress)
8211 return; 7005 return;
@@ -8340,165 +7134,10 @@ void set_curr_task(int cpu, struct task_struct *p)
8340 7134
8341#endif 7135#endif
8342 7136
8343#ifdef CONFIG_FAIR_GROUP_SCHED
8344static void free_fair_sched_group(struct task_group *tg)
8345{
8346 int i;
8347
8348 for_each_possible_cpu(i) {
8349 if (tg->cfs_rq)
8350 kfree(tg->cfs_rq[i]);
8351 if (tg->se)
8352 kfree(tg->se[i]);
8353 }
8354
8355 kfree(tg->cfs_rq);
8356 kfree(tg->se);
8357}
8358
8359static
8360int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8361{
8362 struct cfs_rq *cfs_rq;
8363 struct sched_entity *se;
8364 int i;
8365
8366 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8367 if (!tg->cfs_rq)
8368 goto err;
8369 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8370 if (!tg->se)
8371 goto err;
8372
8373 tg->shares = NICE_0_LOAD;
8374
8375 for_each_possible_cpu(i) {
8376 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8377 GFP_KERNEL, cpu_to_node(i));
8378 if (!cfs_rq)
8379 goto err;
8380
8381 se = kzalloc_node(sizeof(struct sched_entity),
8382 GFP_KERNEL, cpu_to_node(i));
8383 if (!se)
8384 goto err_free_rq;
8385
8386 init_cfs_rq(cfs_rq);
8387 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8388 }
8389
8390 return 1;
8391
8392err_free_rq:
8393 kfree(cfs_rq);
8394err:
8395 return 0;
8396}
8397
8398static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8399{
8400 struct rq *rq = cpu_rq(cpu);
8401 unsigned long flags;
8402
8403 /*
8404 * Only empty task groups can be destroyed; so we can speculatively
8405 * check on_list without danger of it being re-added.
8406 */
8407 if (!tg->cfs_rq[cpu]->on_list)
8408 return;
8409
8410 raw_spin_lock_irqsave(&rq->lock, flags);
8411 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8412 raw_spin_unlock_irqrestore(&rq->lock, flags);
8413}
8414#else /* !CONFIG_FAIR_GROUP_SCHED */
8415static inline void free_fair_sched_group(struct task_group *tg)
8416{
8417}
8418
8419static inline
8420int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8421{
8422 return 1;
8423}
8424
8425static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8426{
8427}
8428#endif /* CONFIG_FAIR_GROUP_SCHED */
8429
8430#ifdef CONFIG_RT_GROUP_SCHED
8431static void free_rt_sched_group(struct task_group *tg)
8432{
8433 int i;
8434
8435 if (tg->rt_se)
8436 destroy_rt_bandwidth(&tg->rt_bandwidth);
8437
8438 for_each_possible_cpu(i) {
8439 if (tg->rt_rq)
8440 kfree(tg->rt_rq[i]);
8441 if (tg->rt_se)
8442 kfree(tg->rt_se[i]);
8443 }
8444
8445 kfree(tg->rt_rq);
8446 kfree(tg->rt_se);
8447}
8448
8449static
8450int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8451{
8452 struct rt_rq *rt_rq;
8453 struct sched_rt_entity *rt_se;
8454 int i;
8455
8456 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8457 if (!tg->rt_rq)
8458 goto err;
8459 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8460 if (!tg->rt_se)
8461 goto err;
8462
8463 init_rt_bandwidth(&tg->rt_bandwidth,
8464 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8465
8466 for_each_possible_cpu(i) {
8467 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8468 GFP_KERNEL, cpu_to_node(i));
8469 if (!rt_rq)
8470 goto err;
8471
8472 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8473 GFP_KERNEL, cpu_to_node(i));
8474 if (!rt_se)
8475 goto err_free_rq;
8476
8477 init_rt_rq(rt_rq, cpu_rq(i));
8478 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8479 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8480 }
8481
8482 return 1;
8483
8484err_free_rq:
8485 kfree(rt_rq);
8486err:
8487 return 0;
8488}
8489#else /* !CONFIG_RT_GROUP_SCHED */
8490static inline void free_rt_sched_group(struct task_group *tg)
8491{
8492}
8493
8494static inline
8495int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8496{
8497 return 1;
8498}
8499#endif /* CONFIG_RT_GROUP_SCHED */
8500
8501#ifdef CONFIG_CGROUP_SCHED 7137#ifdef CONFIG_CGROUP_SCHED
7138/* task_group_lock serializes the addition/removal of task groups */
7139static DEFINE_SPINLOCK(task_group_lock);
7140
8502static void free_sched_group(struct task_group *tg) 7141static void free_sched_group(struct task_group *tg)
8503{ 7142{
8504 free_fair_sched_group(tg); 7143 free_fair_sched_group(tg);
@@ -8603,47 +7242,13 @@ void sched_move_task(struct task_struct *tsk)
8603} 7242}
8604#endif /* CONFIG_CGROUP_SCHED */ 7243#endif /* CONFIG_CGROUP_SCHED */
8605 7244
8606#ifdef CONFIG_FAIR_GROUP_SCHED 7245#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8607static DEFINE_MUTEX(shares_mutex); 7246static unsigned long to_ratio(u64 period, u64 runtime)
8608
8609int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8610{ 7247{
8611 int i; 7248 if (runtime == RUNTIME_INF)
8612 unsigned long flags; 7249 return 1ULL << 20;
8613
8614 /*
8615 * We can't change the weight of the root cgroup.
8616 */
8617 if (!tg->se[0])
8618 return -EINVAL;
8619
8620 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8621
8622 mutex_lock(&shares_mutex);
8623 if (tg->shares == shares)
8624 goto done;
8625
8626 tg->shares = shares;
8627 for_each_possible_cpu(i) {
8628 struct rq *rq = cpu_rq(i);
8629 struct sched_entity *se;
8630
8631 se = tg->se[i];
8632 /* Propagate contribution to hierarchy */
8633 raw_spin_lock_irqsave(&rq->lock, flags);
8634 for_each_sched_entity(se)
8635 update_cfs_shares(group_cfs_rq(se));
8636 raw_spin_unlock_irqrestore(&rq->lock, flags);
8637 }
8638
8639done:
8640 mutex_unlock(&shares_mutex);
8641 return 0;
8642}
8643 7250
8644unsigned long sched_group_shares(struct task_group *tg) 7251 return div64_u64(runtime << 20, period);
8645{
8646 return tg->shares;
8647} 7252}
8648#endif 7253#endif
8649 7254
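
The relocated to_ratio() expresses runtime/period as a fixed-point fraction scaled by 2^20, with RUNTIME_INF mapped to exactly 1.0 (1 << 20). A small user-space model with worked numbers, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define RATIO_SHIFT 20		/* same 2^20 scale factor as to_ratio() */

static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << RATIO_SHIFT) / period_us;
}

int main(void)
{
	/* 950ms of runtime every 1s -> 0.95 of a CPU, i.e. ~0.95 * 2^20 */
	printf("%llu (a full CPU would be %u)\n",
	       (unsigned long long)ratio(1000000, 950000), 1U << RATIO_SHIFT);
	return 0;
}
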
@@ -8653,21 +7258,13 @@ unsigned long sched_group_shares(struct task_group *tg)
8653 */ 7258 */
8654static DEFINE_MUTEX(rt_constraints_mutex); 7259static DEFINE_MUTEX(rt_constraints_mutex);
8655 7260
8656static unsigned long to_ratio(u64 period, u64 runtime)
8657{
8658 if (runtime == RUNTIME_INF)
8659 return 1ULL << 20;
8660
8661 return div64_u64(runtime << 20, period);
8662}
8663
8664/* Must be called with tasklist_lock held */ 7261/* Must be called with tasklist_lock held */
8665static inline int tg_has_rt_tasks(struct task_group *tg) 7262static inline int tg_has_rt_tasks(struct task_group *tg)
8666{ 7263{
8667 struct task_struct *g, *p; 7264 struct task_struct *g, *p;
8668 7265
8669 do_each_thread(g, p) { 7266 do_each_thread(g, p) {
8670 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7267 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8671 return 1; 7268 return 1;
8672 } while_each_thread(g, p); 7269 } while_each_thread(g, p);
8673 7270
@@ -8680,7 +7277,7 @@ struct rt_schedulable_data {
8680 u64 rt_runtime; 7277 u64 rt_runtime;
8681}; 7278};
8682 7279
8683static int tg_schedulable(struct task_group *tg, void *data) 7280static int tg_rt_schedulable(struct task_group *tg, void *data)
8684{ 7281{
8685 struct rt_schedulable_data *d = data; 7282 struct rt_schedulable_data *d = data;
8686 struct task_group *child; 7283 struct task_group *child;
@@ -8738,16 +7335,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8738 7335
8739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7336static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8740{ 7337{
7338 int ret;
7339
8741 struct rt_schedulable_data data = { 7340 struct rt_schedulable_data data = {
8742 .tg = tg, 7341 .tg = tg,
8743 .rt_period = period, 7342 .rt_period = period,
8744 .rt_runtime = runtime, 7343 .rt_runtime = runtime,
8745 }; 7344 };
8746 7345
8747 return walk_tg_tree(tg_schedulable, tg_nop, &data); 7346 rcu_read_lock();
7347 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7348 rcu_read_unlock();
7349
7350 return ret;
8748} 7351}
8749 7352
8750static int tg_set_bandwidth(struct task_group *tg, 7353static int tg_set_rt_bandwidth(struct task_group *tg,
8751 u64 rt_period, u64 rt_runtime) 7354 u64 rt_period, u64 rt_runtime)
8752{ 7355{
8753 int i, err = 0; 7356 int i, err = 0;
@@ -8786,7 +7389,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8786 if (rt_runtime_us < 0) 7389 if (rt_runtime_us < 0)
8787 rt_runtime = RUNTIME_INF; 7390 rt_runtime = RUNTIME_INF;
8788 7391
8789 return tg_set_bandwidth(tg, rt_period, rt_runtime); 7392 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8790} 7393}
8791 7394
8792long sched_group_rt_runtime(struct task_group *tg) 7395long sched_group_rt_runtime(struct task_group *tg)
@@ -8811,7 +7414,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8811 if (rt_period == 0) 7414 if (rt_period == 0)
8812 return -EINVAL; 7415 return -EINVAL;
8813 7416
8814 return tg_set_bandwidth(tg, rt_period, rt_runtime); 7417 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8815} 7418}
8816 7419
8817long sched_group_rt_period(struct task_group *tg) 7420long sched_group_rt_period(struct task_group *tg)
@@ -8953,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8953 sched_destroy_group(tg); 7556 sched_destroy_group(tg);
8954} 7557}
8955 7558
8956static int 7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8957cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7560 struct cgroup_taskset *tset)
8958{ 7561{
7562 struct task_struct *task;
7563
7564 cgroup_taskset_for_each(task, cgrp, tset) {
8959#ifdef CONFIG_RT_GROUP_SCHED 7565#ifdef CONFIG_RT_GROUP_SCHED
8960 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7566 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
8961 return -EINVAL; 7567 return -EINVAL;
8962#else 7568#else
8963 /* We don't support RT-tasks being in separate groups */ 7569 /* We don't support RT-tasks being in separate groups */
8964 if (tsk->sched_class != &fair_sched_class) 7570 if (task->sched_class != &fair_sched_class)
8965 return -EINVAL; 7571 return -EINVAL;
8966#endif 7572#endif
7573 }
8967 return 0; 7574 return 0;
8968} 7575}
8969 7576
8970static void 7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8971cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7578 struct cgroup_taskset *tset)
8972{ 7579{
8973 sched_move_task(tsk); 7580 struct task_struct *task;
7581
7582 cgroup_taskset_for_each(task, cgrp, tset)
7583 sched_move_task(task);
8974} 7584}
8975 7585
8976static void 7586static void
@@ -9001,6 +7611,237 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9001 7611
9002 return (u64) scale_load_down(tg->shares); 7612 return (u64) scale_load_down(tg->shares);
9003} 7613}
7614
7615#ifdef CONFIG_CFS_BANDWIDTH
7616static DEFINE_MUTEX(cfs_constraints_mutex);
7617
7618const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7619const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7620
7621static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7622
7623static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7624{
7625 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7627
7628 if (tg == &root_task_group)
7629 return -EINVAL;
7630
7631 /*
 7632 * Ensure we have at least some amount of bandwidth every period. This is
7633 * to prevent reaching a state of large arrears when throttled via
7634 * entity_tick() resulting in prolonged exit starvation.
7635 */
7636 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7637 return -EINVAL;
7638
7639 /*
 7640 * Likewise, bound things on the other side by preventing insane quota
7641 * periods. This also allows us to normalize in computing quota
7642 * feasibility.
7643 */
7644 if (period > max_cfs_quota_period)
7645 return -EINVAL;
7646
7647 mutex_lock(&cfs_constraints_mutex);
7648 ret = __cfs_schedulable(tg, period, quota);
7649 if (ret)
7650 goto out_unlock;
7651
7652 runtime_enabled = quota != RUNTIME_INF;
7653 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7654 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7655 raw_spin_lock_irq(&cfs_b->lock);
7656 cfs_b->period = ns_to_ktime(period);
7657 cfs_b->quota = quota;
7658
7659 __refill_cfs_bandwidth_runtime(cfs_b);
7660 /* restart the period timer (if active) to handle new period expiry */
7661 if (runtime_enabled && cfs_b->timer_active) {
7662 /* force a reprogram */
7663 cfs_b->timer_active = 0;
7664 __start_cfs_bandwidth(cfs_b);
7665 }
7666 raw_spin_unlock_irq(&cfs_b->lock);
7667
7668 for_each_possible_cpu(i) {
7669 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7670 struct rq *rq = cfs_rq->rq;
7671
7672 raw_spin_lock_irq(&rq->lock);
7673 cfs_rq->runtime_enabled = runtime_enabled;
7674 cfs_rq->runtime_remaining = 0;
7675
7676 if (cfs_rq->throttled)
7677 unthrottle_cfs_rq(cfs_rq);
7678 raw_spin_unlock_irq(&rq->lock);
7679 }
7680out_unlock:
7681 mutex_unlock(&cfs_constraints_mutex);
7682
7683 return ret;
7684}
7685
7686int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7687{
7688 u64 quota, period;
7689
7690 period = ktime_to_ns(tg->cfs_bandwidth.period);
7691 if (cfs_quota_us < 0)
7692 quota = RUNTIME_INF;
7693 else
7694 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7695
7696 return tg_set_cfs_bandwidth(tg, period, quota);
7697}
7698
7699long tg_get_cfs_quota(struct task_group *tg)
7700{
7701 u64 quota_us;
7702
7703 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7704 return -1;
7705
7706 quota_us = tg->cfs_bandwidth.quota;
7707 do_div(quota_us, NSEC_PER_USEC);
7708
7709 return quota_us;
7710}
7711
7712int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7713{
7714 u64 quota, period;
7715
7716 period = (u64)cfs_period_us * NSEC_PER_USEC;
7717 quota = tg->cfs_bandwidth.quota;
7718
7719 return tg_set_cfs_bandwidth(tg, period, quota);
7720}
7721
7722long tg_get_cfs_period(struct task_group *tg)
7723{
7724 u64 cfs_period_us;
7725
7726 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7727 do_div(cfs_period_us, NSEC_PER_USEC);
7728
7729 return cfs_period_us;
7730}
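
These setters accept microseconds from the cgroup interface and store nanoseconds internally; a negative quota means "no limit", and a quota larger than the period simply means more than one CPU's worth of bandwidth. A user-space sketch of the unit handling with illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	long cfs_quota_us  = 200000;	/* 200ms of runtime ... */
	long cfs_period_us = 100000;	/* ... every 100ms      */
	uint64_t quota_ns  = (uint64_t)cfs_quota_us * NSEC_PER_USEC;
	uint64_t period_ns = (uint64_t)cfs_period_us * NSEC_PER_USEC;

	/* quota > period means more than one CPU's worth: here 2.0 CPUs */
	printf("quota=%lluns period=%lluns -> %.2f CPUs\n",
	       (unsigned long long)quota_ns, (unsigned long long)period_ns,
	       (double)quota_ns / period_ns);
	return 0;
}
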
7731
7732static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7733{
7734 return tg_get_cfs_quota(cgroup_tg(cgrp));
7735}
7736
7737static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7738 s64 cfs_quota_us)
7739{
7740 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7741}
7742
7743static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7744{
7745 return tg_get_cfs_period(cgroup_tg(cgrp));
7746}
7747
7748static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7749 u64 cfs_period_us)
7750{
7751 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7752}
7753
7754struct cfs_schedulable_data {
7755 struct task_group *tg;
7756 u64 period, quota;
7757};
7758
7759/*
7760 * normalize group quota/period to be quota/max_period
7761 * note: units are usecs
7762 */
7763static u64 normalize_cfs_quota(struct task_group *tg,
7764 struct cfs_schedulable_data *d)
7765{
7766 u64 quota, period;
7767
7768 if (tg == d->tg) {
7769 period = d->period;
7770 quota = d->quota;
7771 } else {
7772 period = tg_get_cfs_period(tg);
7773 quota = tg_get_cfs_quota(tg);
7774 }
7775
7776 /* note: these should typically be equivalent */
7777 if (quota == RUNTIME_INF || quota == -1)
7778 return RUNTIME_INF;
7779
7780 return to_ratio(period, quota);
7781}
7782
7783static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7784{
7785 struct cfs_schedulable_data *d = data;
7786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7787 s64 quota = 0, parent_quota = -1;
7788
7789 if (!tg->parent) {
7790 quota = RUNTIME_INF;
7791 } else {
7792 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7793
7794 quota = normalize_cfs_quota(tg, d);
7795 parent_quota = parent_b->hierarchal_quota;
7796
7797 /*
7798 * ensure max(child_quota) <= parent_quota, inherit when no
7799 * limit is set
7800 */
7801 if (quota == RUNTIME_INF)
7802 quota = parent_quota;
7803 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7804 return -EINVAL;
7805 }
7806 cfs_b->hierarchal_quota = quota;
7807
7808 return 0;
7809}
7810
7811static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7812{
7813 int ret;
7814 struct cfs_schedulable_data data = {
7815 .tg = tg,
7816 .period = period,
7817 .quota = quota,
7818 };
7819
7820 if (quota != RUNTIME_INF) {
7821 do_div(data.period, NSEC_PER_USEC);
7822 do_div(data.quota, NSEC_PER_USEC);
7823 }
7824
7825 rcu_read_lock();
7826 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7827 rcu_read_unlock();
7828
7829 return ret;
7830}
7831
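
tg_cfs_schedulable_down() walks the group tree top-down and rejects any child whose normalized quota/period ratio exceeds its parent's; a child with no limit simply inherits the parent's ratio. A user-space model of that check with made-up numbers (the INF handling is simplified relative to the kernel code):

#include <stdint.h>
#include <stdio.h>

#define RATIO_SHIFT 20
#define RATIO_INF   (~0ULL)	/* stand-in for RUNTIME_INF */

static uint64_t ratio(uint64_t period_us, uint64_t quota_us)
{
	return quota_us == RATIO_INF ? RATIO_INF
				     : (quota_us << RATIO_SHIFT) / period_us;
}

int main(void)
{
	/* parent: 150ms/100ms (1.5 CPUs); child asks for 200ms/100ms (2.0) */
	uint64_t parent = ratio(100000, 150000);
	uint64_t child  = ratio(100000, 200000);

	if (child == RATIO_INF)
		child = parent;		/* inherit when no limit is set */

	printf("child %s\n",
	       (parent != RATIO_INF && child > parent) ? "rejected (-EINVAL)"
						       : "accepted");
	return 0;
}
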
7832static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7833 struct cgroup_map_cb *cb)
7834{
7835 struct task_group *tg = cgroup_tg(cgrp);
7836 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7837
7838 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7839 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7840 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7841
7842 return 0;
7843}
7844#endif /* CONFIG_CFS_BANDWIDTH */
9004#endif /* CONFIG_FAIR_GROUP_SCHED */ 7845#endif /* CONFIG_FAIR_GROUP_SCHED */
9005 7846
9006#ifdef CONFIG_RT_GROUP_SCHED 7847#ifdef CONFIG_RT_GROUP_SCHED
@@ -9035,6 +7876,22 @@ static struct cftype cpu_files[] = {
9035 .write_u64 = cpu_shares_write_u64, 7876 .write_u64 = cpu_shares_write_u64,
9036 }, 7877 },
9037#endif 7878#endif
7879#ifdef CONFIG_CFS_BANDWIDTH
7880 {
7881 .name = "cfs_quota_us",
7882 .read_s64 = cpu_cfs_quota_read_s64,
7883 .write_s64 = cpu_cfs_quota_write_s64,
7884 },
7885 {
7886 .name = "cfs_period_us",
7887 .read_u64 = cpu_cfs_period_read_u64,
7888 .write_u64 = cpu_cfs_period_write_u64,
7889 },
7890 {
7891 .name = "stat",
7892 .read_map = cpu_stats_show,
7893 },
7894#endif
9038#ifdef CONFIG_RT_GROUP_SCHED 7895#ifdef CONFIG_RT_GROUP_SCHED
9039 { 7896 {
9040 .name = "rt_runtime_us", 7897 .name = "rt_runtime_us",
@@ -9058,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9058 .name = "cpu", 7915 .name = "cpu",
9059 .create = cpu_cgroup_create, 7916 .create = cpu_cgroup_create,
9060 .destroy = cpu_cgroup_destroy, 7917 .destroy = cpu_cgroup_destroy,
9061 .can_attach_task = cpu_cgroup_can_attach_task, 7918 .can_attach = cpu_cgroup_can_attach,
9062 .attach_task = cpu_cgroup_attach_task, 7919 .attach = cpu_cgroup_attach,
9063 .exit = cpu_cgroup_exit, 7920 .exit = cpu_cgroup_exit,
9064 .populate = cpu_cgroup_populate, 7921 .populate = cpu_cgroup_populate,
9065 .subsys_id = cpu_cgroup_subsys_id, 7922 .subsys_id = cpu_cgroup_subsys_id,
@@ -9077,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9077 * (balbir@in.ibm.com). 7934 * (balbir@in.ibm.com).
9078 */ 7935 */
9079 7936
9080/* track cpu usage of a group of tasks and its child groups */
9081struct cpuacct {
9082 struct cgroup_subsys_state css;
9083 /* cpuusage holds pointer to a u64-type object on every cpu */
9084 u64 __percpu *cpuusage;
9085 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9086 struct cpuacct *parent;
9087};
9088
9089struct cgroup_subsys cpuacct_subsys;
9090
9091/* return cpu accounting group corresponding to this container */
9092static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9093{
9094 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9095 struct cpuacct, css);
9096}
9097
9098/* return cpu accounting group to which this task belongs */
9099static inline struct cpuacct *task_ca(struct task_struct *tsk)
9100{
9101 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9102 struct cpuacct, css);
9103}
9104
9105/* create a new cpu accounting group */ 7937/* create a new cpu accounting group */
9106static struct cgroup_subsys_state *cpuacct_create( 7938static struct cgroup_subsys_state *cpuacct_create(
9107 struct cgroup_subsys *ss, struct cgroup *cgrp) 7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
9108{ 7940{
9109 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7941 struct cpuacct *ca;
9110 int i; 7942
7943 if (!cgrp->parent)
7944 return &root_cpuacct.css;
9111 7945
7946 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9112 if (!ca) 7947 if (!ca)
9113 goto out; 7948 goto out;
9114 7949
@@ -9116,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9116 if (!ca->cpuusage) 7951 if (!ca->cpuusage)
9117 goto out_free_ca; 7952 goto out_free_ca;
9118 7953
9119 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7954 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9120 if (percpu_counter_init(&ca->cpustat[i], 0)) 7955 if (!ca->cpustat)
9121 goto out_free_counters; 7956 goto out_free_cpuusage;
9122
9123 if (cgrp->parent)
9124 ca->parent = cgroup_ca(cgrp->parent);
9125 7957
9126 return &ca->css; 7958 return &ca->css;
9127 7959
9128out_free_counters: 7960out_free_cpuusage:
9129 while (--i >= 0)
9130 percpu_counter_destroy(&ca->cpustat[i]);
9131 free_percpu(ca->cpuusage); 7961 free_percpu(ca->cpuusage);
9132out_free_ca: 7962out_free_ca:
9133 kfree(ca); 7963 kfree(ca);
@@ -9140,10 +7970,8 @@ static void
9140cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9141{ 7971{
9142 struct cpuacct *ca = cgroup_ca(cgrp); 7972 struct cpuacct *ca = cgroup_ca(cgrp);
9143 int i;
9144 7973
9145 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7974 free_percpu(ca->cpustat);
9146 percpu_counter_destroy(&ca->cpustat[i]);
9147 free_percpu(ca->cpuusage); 7975 free_percpu(ca->cpuusage);
9148 kfree(ca); 7976 kfree(ca);
9149} 7977}
@@ -9236,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = {
9236}; 8064};
9237 8065
9238static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8066static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9239 struct cgroup_map_cb *cb) 8067 struct cgroup_map_cb *cb)
9240{ 8068{
9241 struct cpuacct *ca = cgroup_ca(cgrp); 8069 struct cpuacct *ca = cgroup_ca(cgrp);
9242 int i; 8070 int cpu;
8071 s64 val = 0;
8072
8073 for_each_online_cpu(cpu) {
8074 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8075 val += kcpustat->cpustat[CPUTIME_USER];
8076 val += kcpustat->cpustat[CPUTIME_NICE];
8077 }
8078 val = cputime64_to_clock_t(val);
8079 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9243 8080
9244 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8081 val = 0;
9245 s64 val = percpu_counter_read(&ca->cpustat[i]); 8082 for_each_online_cpu(cpu) {
9246 val = cputime64_to_clock_t(val); 8083 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9247 cb->fill(cb, cpuacct_stat_desc[i], val); 8084 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8085 val += kcpustat->cpustat[CPUTIME_IRQ];
8086 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9248 } 8087 }
8088
8089 val = cputime64_to_clock_t(val);
8090 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8091
9249 return 0; 8092 return 0;
9250} 8093}
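
The rewritten cpuacct_stats_show() now sums the raw per-cpu kernel_cpustat buckets (user+nice into "user", system+irq+softirq into "system") and only converts the totals to clock ticks at the end. A user-space model of that aggregation with fabricated per-CPU numbers:

#include <stdint.h>
#include <stdio.h>

enum { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM,
       CPUTIME_IRQ, CPUTIME_SOFTIRQ, NR_STATS };

int main(void)
{
	/* pretend cputime values sampled from two CPUs */
	uint64_t stat[2][NR_STATS] = {
		{ 40000000, 1000000, 9000000, 500000, 250000 },
		{ 35000000, 2000000, 7000000, 300000, 150000 },
	};
	uint64_t user = 0, sys = 0;

	for (int cpu = 0; cpu < 2; cpu++) {
		user += stat[cpu][CPUTIME_USER] + stat[cpu][CPUTIME_NICE];
		sys  += stat[cpu][CPUTIME_SYSTEM] + stat[cpu][CPUTIME_IRQ] +
			stat[cpu][CPUTIME_SOFTIRQ];
	}
	/* the kernel would now feed both totals to cputime64_to_clock_t() */
	printf("user=%llu system=%llu\n",
	       (unsigned long long)user, (unsigned long long)sys);
	return 0;
}
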
9251 8094
@@ -9275,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9275 * 8118 *
9276 * called with rq->lock held. 8119 * called with rq->lock held.
9277 */ 8120 */
9278static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8121void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9279{ 8122{
9280 struct cpuacct *ca; 8123 struct cpuacct *ca;
9281 int cpu; 8124 int cpu;
@@ -9289,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9289 8132
9290 ca = task_ca(tsk); 8133 ca = task_ca(tsk);
9291 8134
9292 for (; ca; ca = ca->parent) { 8135 for (; ca; ca = parent_ca(ca)) {
9293 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8136 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9294 *cpuusage += cputime; 8137 *cpuusage += cputime;
9295 } 8138 }
@@ -9297,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9297 rcu_read_unlock(); 8140 rcu_read_unlock();
9298} 8141}
9299 8142
9300/*
9301 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9302 * in cputime_t units. As a result, cpuacct_update_stats calls
9303 * percpu_counter_add with values large enough to always overflow the
9304 * per cpu batch limit causing bad SMP scalability.
9305 *
9306 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9307 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9308 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9309 */
9310#ifdef CONFIG_SMP
9311#define CPUACCT_BATCH \
9312 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9313#else
9314#define CPUACCT_BATCH 0
9315#endif
9316
9317/*
9318 * Charge the system/user time to the task's accounting group.
9319 */
9320static void cpuacct_update_stats(struct task_struct *tsk,
9321 enum cpuacct_stat_index idx, cputime_t val)
9322{
9323 struct cpuacct *ca;
9324 int batch = CPUACCT_BATCH;
9325
9326 if (unlikely(!cpuacct_subsys.active))
9327 return;
9328
9329 rcu_read_lock();
9330 ca = task_ca(tsk);
9331
9332 do {
9333 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9334 ca = ca->parent;
9335 } while (ca);
9336 rcu_read_unlock();
9337}
9338
9339struct cgroup_subsys cpuacct_subsys = { 8143struct cgroup_subsys cpuacct_subsys = {
9340 .name = "cpuacct", 8144 .name = "cpuacct",
9341 .create = cpuacct_create, 8145 .create = cpuacct_create,
@@ -9344,4 +8148,3 @@ struct cgroup_subsys cpuacct_subsys = {
9344 .subsys_id = cpuacct_subsys_id, 8148 .subsys_id = cpuacct_subsys_id,
9345}; 8149};
9346#endif /* CONFIG_CGROUP_CPUACCT */ 8150#endif /* CONFIG_CGROUP_CPUACCT */
9347
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index 2722dc1b4138..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racey, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
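
The comments in the two hunks above describe a publish/consume protocol: the writer sets the mask before bumping the count (so a non-zero count implies a visible mask) and drops the count before clearing the mask, while cpupri_find() reads the count, issues a read barrier, then looks at the mask and accepts a little wasted work when it races. A runnable user-space analogue using C11 atomics in place of the kernel's smp_mb helpers; the vec_* names are made up and the long-sized mask only models the real cpumask.

#include <stdatomic.h>
#include <stdio.h>

struct vec {
	atomic_int  count;
	atomic_long mask;	/* stands in for the per-priority cpumask */
};

static void vec_add_cpu(struct vec *v, int cpu)
{
	/* mask first, then count: mirrors smp_mb__before_atomic_inc() */
	atomic_fetch_or_explicit(&v->mask, 1L << cpu, memory_order_relaxed);
	atomic_fetch_add_explicit(&v->count, 1, memory_order_release);
}

static void vec_del_cpu(struct vec *v, int cpu)
{
	/* count first, full fence, then mask: mirrors the removal ordering */
	atomic_fetch_sub_explicit(&v->count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_and_explicit(&v->mask, ~(1L << cpu), memory_order_relaxed);
}

static int vec_lookup(struct vec *v, long *mask)
{
	/* reader-side skip on a zero count, as in cpupri_find() */
	if (!atomic_load_explicit(&v->count, memory_order_acquire))
		return 0;
	*mask = atomic_load_explicit(&v->mask, memory_order_relaxed);
	return 1;
}

int main(void)
{
	static struct vec v;
	long mask;

	vec_add_cpu(&v, 3);
	if (vec_lookup(&v, &mask))
		printf("mask=%#lx\n", mask);
	vec_del_cpu(&v, 3);
	return 0;
}
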
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index bc8ee9993814..84adb2d66cbd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -89,7 +96,124 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 96 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 98
92static const struct sched_class fair_sched_class; 99#ifdef CONFIG_CFS_BANDWIDTH
100/*
101 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
102 * each time a cfs_rq requests quota.
103 *
104 * Note: in the case that the slice exceeds the runtime remaining (either due
105 * to consumption or the quota being specified to be smaller than the slice)
106 * we will always only issue the remaining available time.
107 *
108 * default: 5 msec, units: microseconds
109 */
110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
111#endif
112
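
Each cfs_rq pulls runtime from the group-wide pool in sysctl_sched_cfs_bandwidth_slice chunks (5ms by default), and per the note above it never takes more than what remains. A user-space model of that handout with an assumed 12ms of remaining quota:

#include <stdint.h>
#include <stdio.h>

#define SLICE_US 5000ULL	/* default sysctl_sched_cfs_bandwidth_slice */

/* hand out min(slice, remaining) from the global pool */
static uint64_t grab_runtime(uint64_t *pool_us)
{
	uint64_t amount = *pool_us < SLICE_US ? *pool_us : SLICE_US;

	*pool_us -= amount;
	return amount;
}

int main(void)
{
	uint64_t pool = 12000;	/* pretend 12ms of quota is left this period */

	while (pool) {
		uint64_t granted = grab_runtime(&pool);

		/* prints 5000, 5000, 2000: the last request gets the rest */
		printf("granted %llu us, %llu left\n",
		       (unsigned long long)granted, (unsigned long long)pool);
	}
	return 0;
}
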
113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
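
With the default SCHED_TUNABLESCALING_LOG policy the factor is 1 + ilog2(min(num_online_cpus, 8)), so the latency tunables grow only logarithmically with CPU count: 1 CPU gives 1, 2 gives 2, 4 gives 3, and 8 or more give 4. A quick user-space check of those values (ilog2 open-coded here):

#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	for (unsigned int cpus = 1; cpus <= 16; cpus *= 2) {
		unsigned int capped = cpus < 8 ? cpus : 8;

		/* matches SCHED_TUNABLESCALING_LOG in get_update_sysctl_factor() */
		printf("%2u cpus -> factor %u\n", cpus, 1 + ilog2_u(capped));
	}
	return 0;
}
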
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
93 217
94/************************************************************** 218/**************************************************************
95 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -292,6 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 416
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
294 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
420 unsigned long delta_exec);
295 421
296/************************************************************** 422/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -397,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
397 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
398} 524}
399 525
400static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
401{ 527{
402 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
403 529
@@ -418,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
418} 544}
419 545
420#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
421static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
422{ 548{
423 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
424 550
@@ -583,6 +709,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 709 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 710 account_group_exec_runtime(curtask, delta_exec);
585 } 711 }
712
713 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 714}
587 715
588static inline void 716static inline void
@@ -666,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
666{ 794{
667 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
668 if (!parent_entity(se)) 796 if (!parent_entity(se))
669 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
670 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
671 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
672 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -679,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
679{ 807{
680 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
681 if (!parent_entity(se)) 809 if (!parent_entity(se))
682 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
683 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
684 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
685 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -688,6 +816,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 816}
689 817
690#ifdef CONFIG_FAIR_GROUP_SCHED 818#ifdef CONFIG_FAIR_GROUP_SCHED
819/* we need this in update_cfs_load and load-balance functions below */
820static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 821# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 822static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 823 int global_update)
@@ -710,7 +840,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 840 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 841 unsigned long load = cfs_rq->load.weight;
712 842
713 if (cfs_rq->tg == &root_task_group) 843 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 844 return;
715 845
716 now = rq_of(cfs_rq)->clock_task; 846 now = rq_of(cfs_rq)->clock_task;
@@ -752,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
752 list_del_leaf_cfs_rq(cfs_rq); 882 list_del_leaf_cfs_rq(cfs_rq);
753} 883}
754 884
885static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
886{
887 long tg_weight;
888
889 /*
890 * Use this CPU's actual weight instead of the last load_contribution
891 * to gain a more accurate current total weight. See
892 * update_cfs_rq_load_contribution().
893 */
894 tg_weight = atomic_read(&tg->load_weight);
895 tg_weight -= cfs_rq->load_contribution;
896 tg_weight += cfs_rq->load.weight;
897
898 return tg_weight;
899}
900
755static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 901static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
756{ 902{
757 long load_weight, load, shares; 903 long tg_weight, load, shares;
758 904
905 tg_weight = calc_tg_weight(tg, cfs_rq);
759 load = cfs_rq->load.weight; 906 load = cfs_rq->load.weight;
760 907
761 load_weight = atomic_read(&tg->load_weight);
762 load_weight += load;
763 load_weight -= cfs_rq->load_contribution;
764
765 shares = (tg->shares * load); 908 shares = (tg->shares * load);
766 if (load_weight) 909 if (tg_weight)
767 shares /= load_weight; 910 shares /= tg_weight;
768 911
769 if (shares < MIN_SHARES) 912 if (shares < MIN_SHARES)
770 shares = MIN_SHARES; 913 shares = MIN_SHARES;
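
calc_cfs_shares() hands this CPU's group entity a slice of tg->shares proportional to the cfs_rq's weight within the whole group (tg_weight), clamped between MIN_SHARES and MAX_SHARES. Worked numbers in a small user-space model; the values and the MAX_SHARES constant are illustrative (the kernel scales its bound with the load resolution).

#include <stdio.h>

#define MIN_SHARES 2L
#define MAX_SHARES (1L << 18)	/* illustrative upper clamp */

static long cfs_shares(long tg_shares, long cfs_load, long tg_weight)
{
	long shares = tg_shares * cfs_load;

	if (tg_weight)
		shares /= tg_weight;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;
	return shares;
}

int main(void)
{
	/*
	 * Group configured with 1024 shares; this CPU holds 512 of the
	 * group's 2048 total weight -> its entity gets 1024 * 512/2048 = 256.
	 */
	printf("%ld\n", cfs_shares(1024, 512, 2048));
	return 0;
}
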
@@ -819,7 +962,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 962
820 tg = cfs_rq->tg; 963 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 964 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 965 if (!se || throttled_hierarchy(cfs_rq))
823 return; 966 return;
824#ifndef CONFIG_SMP 967#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 968 if (likely(se->load.weight == tg->shares))
@@ -860,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
860 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
861 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
862 1005
863 se->statistics.sleep_start = 0;
864 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
865 1007
866 if (tsk) { 1008 if (tsk) {
@@ -877,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
877 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
878 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
879 1021
880 se->statistics.block_start = 0;
881 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
882 1023
883 if (tsk) { 1024 if (tsk) {
@@ -887,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
887 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
888 } 1029 }
889 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
890 /* 1033 /*
891 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
892 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -950,6 +1093,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 1093 se->vruntime = vruntime;
951} 1094}
952 1095
1096static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1097
953static void 1098static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1099enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 1100{
@@ -979,8 +1124,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1124 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1125 se->on_rq = 1;
981 1126
982 if (cfs_rq->nr_running == 1) 1127 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1128 list_add_leaf_cfs_rq(cfs_rq);
1129 check_enqueue_throttle(cfs_rq);
1130 }
984} 1131}
985 1132
986static void __clear_buddies_last(struct sched_entity *se) 1133static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1175,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1175 __clear_buddies_skip(se);
1029} 1176}
1030 1177
1178static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1179
1031static void 1180static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1181dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1182{
@@ -1066,6 +1215,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1215 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1216 se->vruntime -= cfs_rq->min_vruntime;
1068 1217
1218 /* return excess runtime on last dequeue */
1219 return_cfs_rq_runtime(cfs_rq);
1220
1069 update_min_vruntime(cfs_rq); 1221 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1222 update_cfs_shares(cfs_rq);
1071} 1223}
@@ -1077,6 +1229,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1229check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1230{
1079 unsigned long ideal_runtime, delta_exec; 1231 unsigned long ideal_runtime, delta_exec;
1232 struct sched_entity *se;
1233 s64 delta;
1080 1234
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1235 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1236 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1249,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1249 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1250 * This also mitigates buddy induced latencies under load.
1097 */ 1251 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1252 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1253 return;
1103 1254
1104 if (cfs_rq->nr_running > 1) { 1255 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1256 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1257
1108 if (delta < 0) 1258 if (delta < 0)
1109 return; 1259 return;
1110 1260
1111 if (delta > ideal_runtime) 1261 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1262 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1263}
1115 1264
1116static void 1265static void
@@ -1185,6 +1334,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1334 return se;
1186} 1335}
1187 1336
1337static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1338
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1339static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1340{
1190 /* 1341 /*
@@ -1194,6 +1345,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1345 if (prev->on_rq)
1195 update_curr(cfs_rq); 1346 update_curr(cfs_rq);
1196 1347
1348 /* throttle cfs_rqs exceeding runtime */
1349 check_cfs_rq_runtime(cfs_rq);
1350
1197 check_spread(cfs_rq, prev); 1351 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1352 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1353 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1387,742 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1387 return;
1234#endif 1388#endif
1235 1389
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1390 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1391 check_preempt_tick(cfs_rq, curr);
1238} 1392}
1239 1393
1394
1395/**************************************************
1396 * CFS bandwidth control machinery
1397 */
1398
1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
1426/*
1427 * default period for cfs group bandwidth.
1428 * default: 0.1s, units: nanoseconds
1429 */
1430static inline u64 default_cfs_period(void)
1431{
1432 return 100000000ULL;
1433}
1434
1435static inline u64 sched_cfs_bandwidth_slice(void)
1436{
1437 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1438}
1439
1440/*
1441 * Replenish runtime according to assigned quota and update expiration time.
1442 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1443 * additional synchronization around rq->lock.
1444 *
1445 * requires cfs_b->lock
1446 */
1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1448{
1449 u64 now;
1450
1451 if (cfs_b->quota == RUNTIME_INF)
1452 return;
1453
1454 now = sched_clock_cpu(smp_processor_id());
1455 cfs_b->runtime = cfs_b->quota;
1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1457}
1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1464/* returns 0 on failure to allocate runtime */
1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1466{
1467 struct task_group *tg = cfs_rq->tg;
1468 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1469 u64 amount = 0, min_amount, expires;
1470
1471 /* note: this is a positive sum as runtime_remaining <= 0 */
1472 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1473
1474 raw_spin_lock(&cfs_b->lock);
1475 if (cfs_b->quota == RUNTIME_INF)
1476 amount = min_amount;
1477 else {
1478 /*
1479 * If the bandwidth pool has become inactive, then at least one
1480 * period must have elapsed since the last consumption.
1481 * Refresh the global state and ensure bandwidth timer becomes
1482 * active.
1483 */
1484 if (!cfs_b->timer_active) {
1485 __refill_cfs_bandwidth_runtime(cfs_b);
1486 __start_cfs_bandwidth(cfs_b);
1487 }
1488
1489 if (cfs_b->runtime > 0) {
1490 amount = min(cfs_b->runtime, min_amount);
1491 cfs_b->runtime -= amount;
1492 cfs_b->idle = 0;
1493 }
1494 }
1495 expires = cfs_b->runtime_expires;
1496 raw_spin_unlock(&cfs_b->lock);
1497
1498 cfs_rq->runtime_remaining += amount;
1499 /*
1500 * we may have advanced our local expiration to account for allowed
1501 * spread between our sched_clock and the one on which runtime was
1502 * issued.
1503 */
1504 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1505 cfs_rq->runtime_expires = expires;
1506
1507 return cfs_rq->runtime_remaining > 0;
1508}
1509
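
assign_cfs_rq_runtime() above tops the local runtime_remaining back up to one sched_cfs_bandwidth_slice() worth of runtime by drawing from the group-wide pool, and the cfs_rq may keep running only if that leaves it positive. A minimal user-space model of that hand-out, assuming a fixed 5ms slice and ignoring the quota-refresh, expiry and locking details:

#include <stdio.h>
#include <stdint.h>

#define SLICE_NS 5000000LL              /* assumed 5ms bandwidth slice */

struct pool { int64_t runtime; };       /* stand-in for the per-group cfs_b->runtime */

/* top the local budget up to one slice; returns 1 if the queue may keep running */
static int assign_runtime(struct pool *p, int64_t *local_remaining)
{
        int64_t want = SLICE_NS - *local_remaining;     /* local_remaining <= 0 here */
        int64_t got  = want < p->runtime ? want : p->runtime;

        p->runtime       -= got;
        *local_remaining += got;
        return *local_remaining > 0;
}

int main(void)
{
        struct pool global = { .runtime = 8000000 };    /* 8ms left in this period */
        int64_t local = -1000000;                       /* queue is 1ms overdrawn */
        int runnable = assign_runtime(&global, &local);

        printf("runnable=%d local=%lld global=%lld\n",
               runnable, (long long)local, (long long)global.runtime);
        return 0;
}
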
1510/*
1511 * Note: This depends on the synchronization provided by sched_clock and the
1512 * fact that rq->clock snapshots this value.
1513 */
1514static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1515{
1516 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1517 struct rq *rq = rq_of(cfs_rq);
1518
1519 /* if the deadline is ahead of our clock, nothing to do */
1520 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1521 return;
1522
1523 if (cfs_rq->runtime_remaining < 0)
1524 return;
1525
1526 /*
1527 * If the local deadline has passed we have to consider the
1528 * possibility that our sched_clock is 'fast' and the global deadline
1529 * has not truly expired.
1530 *
 1531 * Fortunately we can determine whether this is the case by checking
1532 * whether the global deadline has advanced.
1533 */
1534
1535 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1536 /* extend local deadline, drift is bounded above by 2 ticks */
1537 cfs_rq->runtime_expires += TICK_NSEC;
1538 } else {
1539 /* global deadline is ahead, expiration has passed */
1540 cfs_rq->runtime_remaining = 0;
1541 }
1542}
1543
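
The expiry checks above rely on the signed-difference idiom, (s64)(a - b) < 0 meaning "a is before b", which stays correct even if the u64 clock wraps. The same idiom in a standalone user-space form:

#include <stdio.h>
#include <stdint.h>

/* "a is before b", robust to u64 wraparound, as in the expiry checks above */
static int time_before64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 100;

        printf("%d\n", time_before64(100, 200));        /* 1 */
        printf("%d\n", time_before64(near_wrap, 50));   /* 1: 50 lies just past the wrap */
        printf("%d\n", time_before64(50, near_wrap));   /* 0 */
        return 0;
}
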
1544static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1545 unsigned long delta_exec)
1546{
1547 /* dock delta_exec before expiring quota (as it could span periods) */
1548 cfs_rq->runtime_remaining -= delta_exec;
1549 expire_cfs_rq_runtime(cfs_rq);
1550
1551 if (likely(cfs_rq->runtime_remaining > 0))
1552 return;
1553
1554 /*
1555 * if we're unable to extend our runtime we resched so that the active
1556 * hierarchy can be throttled
1557 */
1558 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1559 resched_task(rq_of(cfs_rq)->curr);
1560}
1561
1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1563 unsigned long delta_exec)
1564{
1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1566 return;
1567
1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1569}
1570
1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1572{
1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1574}
1575
1576/* check whether cfs_rq, or any parent, is throttled */
1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1578{
1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1580}
1581
1582/*
1583 * Ensure that neither of the group entities corresponding to src_cpu or
1584 * dest_cpu are members of a throttled hierarchy when performing group
1585 * load-balance operations.
1586 */
1587static inline int throttled_lb_pair(struct task_group *tg,
1588 int src_cpu, int dest_cpu)
1589{
1590 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1591
1592 src_cfs_rq = tg->cfs_rq[src_cpu];
1593 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1594
1595 return throttled_hierarchy(src_cfs_rq) ||
1596 throttled_hierarchy(dest_cfs_rq);
1597}
1598
1599/* updated child weight may affect parent so we have to do this bottom up */
1600static int tg_unthrottle_up(struct task_group *tg, void *data)
1601{
1602 struct rq *rq = data;
1603 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1604
1605 cfs_rq->throttle_count--;
1606#ifdef CONFIG_SMP
1607 if (!cfs_rq->throttle_count) {
1608 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1609
1610 /* leaving throttled state, advance shares averaging windows */
1611 cfs_rq->load_stamp += delta;
1612 cfs_rq->load_last += delta;
1613
1614 /* update entity weight now that we are on_rq again */
1615 update_cfs_shares(cfs_rq);
1616 }
1617#endif
1618
1619 return 0;
1620}
1621
1622static int tg_throttle_down(struct task_group *tg, void *data)
1623{
1624 struct rq *rq = data;
1625 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1626
1627 /* group is entering throttled state, record last load */
1628 if (!cfs_rq->throttle_count)
1629 update_cfs_load(cfs_rq, 0);
1630 cfs_rq->throttle_count++;
1631
1632 return 0;
1633}
1634
1635static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1636{
1637 struct rq *rq = rq_of(cfs_rq);
1638 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1639 struct sched_entity *se;
1640 long task_delta, dequeue = 1;
1641
1642 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1643
1644 /* account load preceding throttle */
1645 rcu_read_lock();
1646 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1647 rcu_read_unlock();
1648
1649 task_delta = cfs_rq->h_nr_running;
1650 for_each_sched_entity(se) {
1651 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1652 /* throttled entity or throttle-on-deactivate */
1653 if (!se->on_rq)
1654 break;
1655
1656 if (dequeue)
1657 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1658 qcfs_rq->h_nr_running -= task_delta;
1659
1660 if (qcfs_rq->load.weight)
1661 dequeue = 0;
1662 }
1663
1664 if (!se)
1665 rq->nr_running -= task_delta;
1666
1667 cfs_rq->throttled = 1;
1668 cfs_rq->throttled_timestamp = rq->clock;
1669 raw_spin_lock(&cfs_b->lock);
1670 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1671 raw_spin_unlock(&cfs_b->lock);
1672}
1673
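
throttle_cfs_rq() above dequeues the throttled group's entity level by level, but stops dequeueing as soon as an ancestor still has other weight queued (higher levels only see their h_nr_running adjusted). A toy sketch of that walk, simplified to a single weight for the departing entity at every level:

#include <stdio.h>

/* walk from the throttled group toward the root; stop dequeueing once a level
 * keeps other weight queued */
static void throttle_walk(long level_weight[], int levels, long my_weight)
{
        int dequeue = 1;

        for (int lvl = 0; lvl < levels; lvl++) {
                if (dequeue)
                        level_weight[lvl] -= my_weight; /* our entity leaves this level */
                if (level_weight[lvl])                  /* a sibling is still queued here */
                        dequeue = 0;
        }
}

int main(void)
{
        /* level 0 holds only us (1024); level 1 also carries a 512-weight sibling */
        long level_weight[2] = { 1024, 1024 + 512 };

        throttle_walk(level_weight, 2, 1024);
        printf("level0=%ld level1=%ld\n", level_weight[0], level_weight[1]); /* 0 and 512 */
        return 0;
}
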
1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1675{
1676 struct rq *rq = rq_of(cfs_rq);
1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1678 struct sched_entity *se;
1679 int enqueue = 1;
1680 long task_delta;
1681
1682 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1683
1684 cfs_rq->throttled = 0;
1685 raw_spin_lock(&cfs_b->lock);
1686 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1687 list_del_rcu(&cfs_rq->throttled_list);
1688 raw_spin_unlock(&cfs_b->lock);
1689 cfs_rq->throttled_timestamp = 0;
1690
1691 update_rq_clock(rq);
1692 /* update hierarchical throttle state */
1693 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1694
1695 if (!cfs_rq->load.weight)
1696 return;
1697
1698 task_delta = cfs_rq->h_nr_running;
1699 for_each_sched_entity(se) {
1700 if (se->on_rq)
1701 enqueue = 0;
1702
1703 cfs_rq = cfs_rq_of(se);
1704 if (enqueue)
1705 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1706 cfs_rq->h_nr_running += task_delta;
1707
1708 if (cfs_rq_throttled(cfs_rq))
1709 break;
1710 }
1711
1712 if (!se)
1713 rq->nr_running += task_delta;
1714
1715 /* determine whether we need to wake up potentially idle cpu */
1716 if (rq->curr == rq->idle && rq->cfs.nr_running)
1717 resched_task(rq->curr);
1718}
1719
1720static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1721 u64 remaining, u64 expires)
1722{
1723 struct cfs_rq *cfs_rq;
1724 u64 runtime = remaining;
1725
1726 rcu_read_lock();
1727 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1728 throttled_list) {
1729 struct rq *rq = rq_of(cfs_rq);
1730
1731 raw_spin_lock(&rq->lock);
1732 if (!cfs_rq_throttled(cfs_rq))
1733 goto next;
1734
1735 runtime = -cfs_rq->runtime_remaining + 1;
1736 if (runtime > remaining)
1737 runtime = remaining;
1738 remaining -= runtime;
1739
1740 cfs_rq->runtime_remaining += runtime;
1741 cfs_rq->runtime_expires = expires;
1742
1743 /* we check whether we're throttled above */
1744 if (cfs_rq->runtime_remaining > 0)
1745 unthrottle_cfs_rq(cfs_rq);
1746
1747next:
1748 raw_spin_unlock(&rq->lock);
1749
1750 if (!remaining)
1751 break;
1752 }
1753 rcu_read_unlock();
1754
1755 return remaining;
1756}
1757
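
distribute_cfs_runtime() hands each throttled cfs_rq just enough runtime to bring runtime_remaining above zero, stops once the refill is exhausted, and only queues that actually went positive get unthrottled. A small user-space sketch of that allocation loop, with a plain array standing in for the RCU list and no locking:

#include <stdio.h>
#include <stdint.h>

/* give each overdrawn queue just enough to go positive, stop when the pool is dry */
static int64_t distribute(int64_t remaining_of[], int n, int64_t pool)
{
        for (int i = 0; i < n && pool; i++) {
                int64_t grant = -remaining_of[i] + 1;   /* amount needed to exceed 0 */

                if (grant > pool)
                        grant = pool;
                remaining_of[i] += grant;
                pool            -= grant;
        }
        return pool;
}

int main(void)
{
        int64_t remaining_of[3] = { -3, -5, -2 };       /* three throttled queues */
        int64_t left = distribute(remaining_of, 3, 8);

        /* prints: left=0 queues: 1 -1 -2  (only the first went positive) */
        printf("left=%lld queues: %lld %lld %lld\n", (long long)left,
               (long long)remaining_of[0], (long long)remaining_of[1],
               (long long)remaining_of[2]);
        return 0;
}
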
1758/*
1759 * Responsible for refilling a task_group's bandwidth and unthrottling its
1760 * cfs_rqs as appropriate. If there has been no activity within the last
1761 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1762 * used to track this state.
1763 */
1764static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1765{
1766 u64 runtime, runtime_expires;
1767 int idle = 1, throttled;
1768
1769 raw_spin_lock(&cfs_b->lock);
1770 /* no need to continue the timer with no bandwidth constraint */
1771 if (cfs_b->quota == RUNTIME_INF)
1772 goto out_unlock;
1773
1774 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1775 /* idle depends on !throttled (for the case of a large deficit) */
1776 idle = cfs_b->idle && !throttled;
1777 cfs_b->nr_periods += overrun;
1778
1779 /* if we're going inactive then everything else can be deferred */
1780 if (idle)
1781 goto out_unlock;
1782
1783 __refill_cfs_bandwidth_runtime(cfs_b);
1784
1785 if (!throttled) {
1786 /* mark as potentially idle for the upcoming period */
1787 cfs_b->idle = 1;
1788 goto out_unlock;
1789 }
1790
1791 /* account preceding periods in which throttling occurred */
1792 cfs_b->nr_throttled += overrun;
1793
1794 /*
1795 * There are throttled entities so we must first use the new bandwidth
1796 * to unthrottle them before making it generally available. This
1797 * ensures that all existing debts will be paid before a new cfs_rq is
1798 * allowed to run.
1799 */
1800 runtime = cfs_b->runtime;
1801 runtime_expires = cfs_b->runtime_expires;
1802 cfs_b->runtime = 0;
1803
1804 /*
1805 * This check is repeated as we are holding onto the new bandwidth
1806 * while we unthrottle. This can potentially race with an unthrottled
1807 * group trying to acquire new bandwidth from the global pool.
1808 */
1809 while (throttled && runtime > 0) {
1810 raw_spin_unlock(&cfs_b->lock);
1811 /* we can't nest cfs_b->lock while distributing bandwidth */
1812 runtime = distribute_cfs_runtime(cfs_b, runtime,
1813 runtime_expires);
1814 raw_spin_lock(&cfs_b->lock);
1815
1816 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1817 }
1818
1819 /* return (any) remaining runtime */
1820 cfs_b->runtime = runtime;
1821 /*
1822 * While we are ensured activity in the period following an
1823 * unthrottle, this also covers the case in which the new bandwidth is
1824 * insufficient to cover the existing bandwidth deficit. (Forcing the
1825 * timer to remain active while there are any throttled entities.)
1826 */
1827 cfs_b->idle = 0;
1828out_unlock:
1829 if (idle)
1830 cfs_b->timer_active = 0;
1831 raw_spin_unlock(&cfs_b->lock);
1832
1833 return idle;
1834}
1835
1836/* a cfs_rq won't donate quota below this amount */
1837static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1838/* minimum remaining period time to redistribute slack quota */
1839static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1840/* how long we wait to gather additional slack before distributing */
1841static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1842
1843/* are we near the end of the current quota period? */
1844static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1845{
1846 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1847 u64 remaining;
1848
1849 /* if the call-back is running a quota refresh is already occurring */
1850 if (hrtimer_callback_running(refresh_timer))
1851 return 1;
1852
1853 /* is a quota refresh about to occur? */
1854 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1855 if (remaining < min_expire)
1856 return 1;
1857
1858 return 0;
1859}
1860
1861static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1862{
1863 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1864
1865 /* if there's a quota refresh soon don't bother with slack */
1866 if (runtime_refresh_within(cfs_b, min_left))
1867 return;
1868
1869 start_bandwidth_timer(&cfs_b->slack_timer,
1870 ns_to_ktime(cfs_bandwidth_slack_period));
1871}
1872
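
With the constants above, slack redistribution is only worth arming when the period timer is not about to fire anyway: at least cfs_bandwidth_slack_period + min_bandwidth_expiration (5ms + 2ms) must remain before the next refresh. A trivial sketch of that decision, assuming the remaining time is already known:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

static const uint64_t min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
static const uint64_t slack_period             = 5 * NSEC_PER_MSEC;

/* arm the slack timer only if the period refresh is not imminent */
static int should_start_slack(uint64_t ns_until_refresh)
{
        return ns_until_refresh >= slack_period + min_bandwidth_expiration;
}

int main(void)
{
        printf("%d\n", should_start_slack(20 * NSEC_PER_MSEC)); /* 1: plenty of period left */
        printf("%d\n", should_start_slack(4 * NSEC_PER_MSEC));  /* 0: refresh is about to run */
        return 0;
}
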
1873/* we know any runtime found here is valid as update_curr() precedes return */
1874static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1875{
1876 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1877 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1878
1879 if (slack_runtime <= 0)
1880 return;
1881
1882 raw_spin_lock(&cfs_b->lock);
1883 if (cfs_b->quota != RUNTIME_INF &&
1884 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1885 cfs_b->runtime += slack_runtime;
1886
1887 /* we are under rq->lock, defer unthrottling using a timer */
1888 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1889 !list_empty(&cfs_b->throttled_cfs_rq))
1890 start_cfs_slack_bandwidth(cfs_b);
1891 }
1892 raw_spin_unlock(&cfs_b->lock);
1893
1894 /* even if it's not valid for return we don't want to try again */
1895 cfs_rq->runtime_remaining -= slack_runtime;
1896}
1897
1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1904 return;
1905
1906 __return_cfs_rq_runtime(cfs_rq);
1907}
1908
1909/*
1910 * This is done with a timer (instead of inline with bandwidth return) since
1911 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1912 */
1913static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1914{
1915 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1916 u64 expires;
1917
1918 /* confirm we're still not at a refresh boundary */
1919 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1920 return;
1921
1922 raw_spin_lock(&cfs_b->lock);
1923 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1924 runtime = cfs_b->runtime;
1925 cfs_b->runtime = 0;
1926 }
1927 expires = cfs_b->runtime_expires;
1928 raw_spin_unlock(&cfs_b->lock);
1929
1930 if (!runtime)
1931 return;
1932
1933 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1934
1935 raw_spin_lock(&cfs_b->lock);
1936 if (expires == cfs_b->runtime_expires)
1937 cfs_b->runtime = runtime;
1938 raw_spin_unlock(&cfs_b->lock);
1939}
1940
1941/*
1942 * When a group wakes up we want to make sure that its quota is not already
1943 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1944 * runtime, as update_curr() throttling cannot trigger until it's on-rq.
1945 */
1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1951 /* an active group must be handled by the update_curr()->put() path */
1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1953 return;
1954
1955 /* ensure the group is not already throttled */
1956 if (cfs_rq_throttled(cfs_rq))
1957 return;
1958
1959 /* update runtime allocation */
1960 account_cfs_rq_runtime(cfs_rq, 0);
1961 if (cfs_rq->runtime_remaining <= 0)
1962 throttle_cfs_rq(cfs_rq);
1963}
1964
1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1972 return;
1973
1974 /*
1975 * it's possible for a throttled entity to be forced into a running
 1976 * state (e.g. set_curr_task); in this case we're finished.
1977 */
1978 if (cfs_rq_throttled(cfs_rq))
1979 return;
1980
1981 throttle_cfs_rq(cfs_rq);
1982}
1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2090 unsigned long delta_exec) {}
2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2093static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2094
2095static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2096{
2097 return 0;
2098}
2099
2100static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2101{
2102 return 0;
2103}
2104
2105static inline int throttled_lb_pair(struct task_group *tg,
2106 int src_cpu, int dest_cpu)
2107{
2108 return 0;
2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2115#endif
2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1240/************************************************** 2126/**************************************************
1241 * CFS operations on tasks: 2127 * CFS operations on tasks:
1242 */ 2128 */
@@ -1249,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1249 2135
1250 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1251 2137
1252 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1253 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1254 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1255 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1280,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1280{ 2166{
1281 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1282 2168
1283 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1284 return; 2170 return;
1285 2171
1286 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -1313,16 +2199,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 2199 break;
1314 cfs_rq = cfs_rq_of(se); 2200 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 2201 enqueue_entity(cfs_rq, se, flags);
2202
2203 /*
2204 * end evaluation on encountering a throttled cfs_rq
2205 *
2206 * note: in the case of encountering a throttled cfs_rq we will
2207 * post the final h_nr_running increment below.
2208 */
2209 if (cfs_rq_throttled(cfs_rq))
2210 break;
2211 cfs_rq->h_nr_running++;
2212
1316 flags = ENQUEUE_WAKEUP; 2213 flags = ENQUEUE_WAKEUP;
1317 } 2214 }
1318 2215
1319 for_each_sched_entity(se) { 2216 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 2217 cfs_rq = cfs_rq_of(se);
2218 cfs_rq->h_nr_running++;
2219
2220 if (cfs_rq_throttled(cfs_rq))
2221 break;
1321 2222
1322 update_cfs_load(cfs_rq, 0); 2223 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 2224 update_cfs_shares(cfs_rq);
1324 } 2225 }
1325 2226
2227 if (!se)
2228 inc_nr_running(rq);
1326 hrtick_update(rq); 2229 hrtick_update(rq);
1327} 2230}
1328 2231
@@ -1343,6 +2246,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 2246 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 2247 dequeue_entity(cfs_rq, se, flags);
1345 2248
2249 /*
2250 * end evaluation on encountering a throttled cfs_rq
2251 *
2252 * note: in the case of encountering a throttled cfs_rq we will
2253 * post the final h_nr_running decrement below.
2254 */
2255 if (cfs_rq_throttled(cfs_rq))
2256 break;
2257 cfs_rq->h_nr_running--;
2258
1346 /* Don't dequeue parent if it has other entities besides us */ 2259 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 2260 if (cfs_rq->load.weight) {
1348 /* 2261 /*
@@ -1361,15 +2274,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 2274
1362 for_each_sched_entity(se) { 2275 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 2276 cfs_rq = cfs_rq_of(se);
2277 cfs_rq->h_nr_running--;
2278
2279 if (cfs_rq_throttled(cfs_rq))
2280 break;
1364 2281
1365 update_cfs_load(cfs_rq, 0); 2282 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2283 update_cfs_shares(cfs_rq);
1367 } 2284 }
1368 2285
2286 if (!se)
2287 dec_nr_running(rq);
1369 hrtick_update(rq); 2288 hrtick_update(rq);
1370} 2289}
1371 2290
1372#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
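
source_load() and target_load() deliberately under- and over-estimate a cpu's load by taking the min (respectively max) of the instantaneous runqueue weight and the decayed cpu_load[] sample, which biases the balancer toward leaving things alone in ambiguous cases. A user-space sketch of that biasing with the load-index history collapsed to a single sample:

#include <stdio.h>

/* conservative low estimate for a migration source */
static unsigned long src_load(unsigned long now, unsigned long decayed)
{
        return now < decayed ? now : decayed;
}

/* conservative high estimate for a migration target */
static unsigned long dst_load(unsigned long now, unsigned long decayed)
{
        return now > decayed ? now : decayed;
}

int main(void)
{
        /* instantaneous weight 1800, decayed history says 2300 */
        printf("source=%lu target=%lu\n",
               src_load(1800, 2300), dst_load(1800, 2300));     /* source=1800 target=2300 */
        return 0;
}
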
1373 2347
1374static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
1375{ 2349{
@@ -1399,42 +2373,105 @@ static void task_waking_fair(struct task_struct *p)
1399 * Adding load to a group doesn't make a group heavier, but can cause movement 2373 * Adding load to a group doesn't make a group heavier, but can cause movement
1400 * of group shares between cpus. Assuming the shares were perfectly aligned one 2374 * of group shares between cpus. Assuming the shares were perfectly aligned one
1401 * can calculate the shift in shares. 2375 * can calculate the shift in shares.
2376 *
2377 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2378 * on this @cpu and results in a total addition (subtraction) of @wg to the
2379 * total group weight.
2380 *
2381 * Given a runqueue weight distribution (rw_i) we can compute a shares
2382 * distribution (s_i) using:
2383 *
2384 * s_i = rw_i / \Sum rw_j (1)
2385 *
2386 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2387 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2388 * shares distribution (s_i):
2389 *
2390 * rw_i = { 2, 4, 1, 0 }
2391 * s_i = { 2/7, 4/7, 1/7, 0 }
2392 *
2393 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
 2394 * task used to run on and the CPU the waker is running on), so we need to
2395 * compute the effect of waking a task on either CPU and, in case of a sync
2396 * wakeup, compute the effect of the current task going to sleep.
2397 *
2398 * So for a change of @wl to the local @cpu with an overall group weight change
 2399 * of @wg we can compute the new shares distribution (s'_i) using:
2400 *
2401 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2402 *
2403 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2404 * differences in waking a task to CPU 0. The additional task changes the
2405 * weight and shares distributions like:
2406 *
2407 * rw'_i = { 3, 4, 1, 0 }
2408 * s'_i = { 3/8, 4/8, 1/8, 0 }
2409 *
2410 * We can then compute the difference in effective weight by using:
2411 *
2412 * dw_i = S * (s'_i - s_i) (3)
2413 *
2414 * Where 'S' is the group weight as seen by its parent.
2415 *
2416 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2417 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2418 * 4/7) times the weight of the group.
1402 */ 2419 */
1403static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2420static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1404{ 2421{
1405 struct sched_entity *se = tg->se[cpu]; 2422 struct sched_entity *se = tg->se[cpu];
1406 2423
1407 if (!tg->parent) 2424 if (!tg->parent) /* the trivial, non-cgroup case */
1408 return wl; 2425 return wl;
1409 2426
1410 for_each_sched_entity(se) { 2427 for_each_sched_entity(se) {
1411 long lw, w; 2428 long w, W;
1412 2429
1413 tg = se->my_q->tg; 2430 tg = se->my_q->tg;
1414 w = se->my_q->load.weight;
1415 2431
1416 /* use this cpu's instantaneous contribution */ 2432 /*
1417 lw = atomic_read(&tg->load_weight); 2433 * W = @wg + \Sum rw_j
1418 lw -= se->my_q->load_contribution; 2434 */
1419 lw += w + wg; 2435 W = wg + calc_tg_weight(tg, se->my_q);
1420 2436
1421 wl += w; 2437 /*
2438 * w = rw_i + @wl
2439 */
2440 w = se->my_q->load.weight + wl;
1422 2441
1423 if (lw > 0 && wl < lw) 2442 /*
1424 wl = (wl * tg->shares) / lw; 2443 * wl = S * s'_i; see (2)
2444 */
2445 if (W > 0 && w < W)
2446 wl = (w * tg->shares) / W;
1425 else 2447 else
1426 wl = tg->shares; 2448 wl = tg->shares;
1427 2449
1428 /* zero point is MIN_SHARES */ 2450 /*
2451 * Per the above, wl is the new se->load.weight value; since
2452 * those are clipped to [MIN_SHARES, ...) do so now. See
2453 * calc_cfs_shares().
2454 */
1429 if (wl < MIN_SHARES) 2455 if (wl < MIN_SHARES)
1430 wl = MIN_SHARES; 2456 wl = MIN_SHARES;
2457
2458 /*
2459 * wl = dw_i = S * (s'_i - s_i); see (3)
2460 */
1431 wl -= se->load.weight; 2461 wl -= se->load.weight;
2462
2463 /*
2464 * Recursively apply this logic to all parent groups to compute
2465 * the final effective load change on the root group. Since
2466 * only the @tg group gets extra weight, all parent groups can
2467 * only redistribute existing shares. @wl is the shift in shares
2468 * resulting from this level per the above.
2469 */
1432 wg = 0; 2470 wg = 0;
1433 } 2471 }
1434 2472
1435 return wl; 2473 return wl;
1436} 2474}
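
The worked example in the comment above can be checked numerically: with rw_i = {2, 4, 1, 0} and one extra task on CPU 0, the share deltas are 3/8 - 2/7 = 5/56 and 4/8 - 4/7 = -4/56. A short user-space check of formulas (1)-(3), treating every task as weight 1 and picking S = 56 so the per-cpu deltas print as whole 56ths:

#include <stdio.h>

int main(void)
{
        double rw[4]  = { 2, 4, 1, 0 };         /* per-cpu runqueue weights, in tasks */
        double sum    = 2 + 4 + 1 + 0;          /* \Sum rw_j = 7 */
        double wl = 1, wg = 1;                  /* one task of weight 1 arrives on cpu 0 */
        double S  = 56;                         /* group weight as seen by the parent */

        for (int i = 0; i < 4; i++) {
                double s  = rw[i] / sum;                                /* (1) */
                double sp = (rw[i] + (i == 0 ? wl : 0)) / (sum + wg);   /* (2) */

                printf("cpu%d: dw = %+.0f/56\n", i, S * (sp - s));      /* (3) */
        }
        return 0;
}

This prints +5/56 for CPU 0 and -4/56 for CPU 1, matching the fractions quoted in the comment.
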
1437
1438#else 2475#else
1439 2476
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2477static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2584
1548 /* Skip over this group if it has no CPUs allowed */ 2585 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2586 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2587 tsk_cpus_allowed(p)))
1551 continue; 2588 continue;
1552 2589
1553 local_group = cpumask_test_cpu(this_cpu, 2590 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2630,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2630 int i;
1594 2631
1595 /* Traverse only the allowed CPUs */ 2632 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2633 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2634 load = weighted_cpuload(i);
1598 2635
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2636 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1613,6 +2650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1613 int cpu = smp_processor_id(); 2650 int cpu = smp_processor_id();
1614 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
1615 struct sched_domain *sd; 2652 struct sched_domain *sd;
2653 struct sched_group *sg;
1616 int i; 2654 int i;
1617 2655
1618 /* 2656 /*
@@ -1633,25 +2671,28 @@ static int select_idle_sibling(struct task_struct *p, int target)
 1633 * Otherwise, iterate the domains and find an eligible idle cpu. 2671 * Otherwise, iterate the domains and find an eligible idle cpu.
1634 */ 2672 */
1635 rcu_read_lock(); 2673 rcu_read_lock();
1636 for_each_domain(target, sd) {
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break;
1639 2674
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2675 sd = rcu_dereference(per_cpu(sd_llc, target));
1641 if (idle_cpu(i)) { 2676 for_each_lower_domain(sd) {
1642 target = i; 2677 sg = sd->groups;
1643 break; 2678 do {
2679 if (!cpumask_intersects(sched_group_cpus(sg),
2680 tsk_cpus_allowed(p)))
2681 goto next;
2682
2683 for_each_cpu(i, sched_group_cpus(sg)) {
2684 if (!idle_cpu(i))
2685 goto next;
1644 } 2686 }
1645 }
1646 2687
1647 /* 2688 target = cpumask_first_and(sched_group_cpus(sg),
1648 * Lets stop looking for an idle sibling when we reached 2689 tsk_cpus_allowed(p));
1649 * the domain that spans the current cpu and prev_cpu. 2690 goto done;
1650 */ 2691next:
1651 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2692 sg = sg->next;
1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2693 } while (sg != sd->groups);
1653 break;
1654 } 2694 }
2695done:
1655 rcu_read_unlock(); 2696 rcu_read_unlock();
1656 2697
1657 return target; 2698 return target;
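
The rewritten select_idle_sibling() now walks the groups of the last-level-cache domain and settles on a group only when every one of its cpus is idle, returning the first cpu of that group (the real code also filters by the task's allowed mask, omitted here). A toy user-space version of that scan with groups and idle state flattened into arrays:

#include <stdio.h>

#define CPUS_PER_GROUP 2

/* first cpu of the first group whose cpus are all idle; -1 if there is none */
static int pick_idle_group_cpu(int groups[][CPUS_PER_GROUP], int ngroups,
                               const int idle[])
{
        for (int g = 0; g < ngroups; g++) {
                int all_idle = 1;

                for (int i = 0; i < CPUS_PER_GROUP; i++)
                        if (!idle[groups[g][i]])
                                all_idle = 0;
                if (all_idle)
                        return groups[g][0];
        }
        return -1;
}

int main(void)
{
        int groups[3][CPUS_PER_GROUP] = { {0, 1}, {2, 3}, {4, 5} }; /* SMT pairs in one LLC */
        int idle[6]                   = { 0, 1, 1, 1, 0, 1 };       /* cpu0 and cpu4 are busy */

        printf("target=%d\n", pick_idle_group_cpu(groups, 3, idle)); /* target=2 */
        return 0;
}
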
@@ -1679,8 +2720,11 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1679 int want_sd = 1; 2720 int want_sd = 1;
1680 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
1681 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
1682 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2728 want_affine = 1;
1685 new_cpu = prev_cpu; 2729 new_cpu = prev_cpu;
1686 } 2730 }
@@ -1875,6 +2919,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2919 if (unlikely(se == pse))
1876 return; 2920 return;
1877 2921
2922 /*
2923 * This is possible from callers such as pull_task(), in which we
 2922 * unconditionally check_preempt_curr() after an enqueue (which may have
 2923 * led to a throttle). This both saves work and prevents false
2926 * next-buddy nomination below.
2927 */
2928 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2929 return;
2930
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2931 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2932 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2933 next_buddy_marked = 1;
@@ -1883,6 +2936,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2936 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2937 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2938 * wake up path.
2939 *
2940 * Note: this also catches the edge-case of curr being in a throttled
2941 * group (e.g. via set_curr_task), since update_curr() (in the
2942 * enqueue of curr) will have resulted in resched being set. This
2943 * prevents us from potentially nominating it as a false LAST_BUDDY
2944 * below.
1886 */ 2945 */
1887 if (test_tsk_need_resched(curr)) 2946 if (test_tsk_need_resched(curr))
1888 return; 2947 return;
@@ -1899,10 +2958,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2958 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2959 return;
1901 2960
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2961 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2962 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2963 BUG_ON(!pse);
@@ -1952,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1952 } while (cfs_rq); 3007 } while (cfs_rq);
1953 3008
1954 p = task_of(se); 3009 p = task_of(se);
1955 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
1956 3012
1957 return p; 3013 return p;
1958} 3014}
@@ -1996,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
1996 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
1997 */ 3053 */
1998 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
1999 } 3061 }
2000 3062
2001 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2005,7 +3067,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 3067{
2006 struct sched_entity *se = &p->se; 3068 struct sched_entity *se = &p->se;
2007 3069
2008 if (!se->on_rq) 3070 /* throttled hierarchies are not runnable */
3071 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 3072 return false;
2010 3073
2011 /* Tell the scheduler that we'd really like pse to run next. */ 3074 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2035,12 +3098,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2035} 3098}
2036 3099
2037/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
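
The NEED_BREAK/HAD_BREAK encoding lets the retry path in load_balance() (further down in this patch) turn a pending break into a counted one with a single add: lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK clears the request bit and bumps the two-bit HAD_BREAK counter, and once LBF_HAD_BREAKS is saturated the next add carries into LBF_ABORT. A standalone demonstration of that flag arithmetic:

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_NEED_BREAK  0x02
#define LBF_HAD_BREAK   0x04
#define LBF_HAD_BREAKS  0x0C
#define LBF_ABORT       0x10

int main(void)
{
        int lb_flags = 0;

        for (int pass = 1; pass <= 4; pass++) {
                lb_flags |= LBF_NEED_BREAK;                     /* balancing hit sysctl_sched_nr_migrate */
                lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;     /* record the break and retry */
                printf("pass %d: flags=0x%02x abort=%d\n",
                       pass, lb_flags, !!(lb_flags & LBF_ABORT));
        }
        return 0;
}

Three recorded breaks fill LBF_HAD_BREAKS (0x0c); the fourth carries into LBF_ABORT and the balance attempt gives up.
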
3138/*
2038 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2039 */ 3140 */
2040static 3141static
2041int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2042 struct sched_domain *sd, enum cpu_idle_type idle, 3143 struct sched_domain *sd, enum cpu_idle_type idle,
2043 int *all_pinned) 3144 int *lb_flags)
2044{ 3145{
2045 int tsk_cache_hot = 0; 3146 int tsk_cache_hot = 0;
2046 /* 3147 /*
@@ -2049,11 +3150,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3150 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 3151 * 3) are cache-hot on their current CPU.
2051 */ 3152 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 3153 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 3155 return 0;
2055 } 3156 }
2056 *all_pinned = 0; 3157 *lb_flags &= ~LBF_ALL_PINNED;
2057 3158
2058 if (task_running(rq, p)) { 3159 if (task_running(rq, p)) {
2059 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2102,6 +3203,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 3203
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 3204 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 3205 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
3206 if (throttled_lb_pair(task_group(p),
3207 busiest->cpu, this_cpu))
3208 break;
2105 3209
2106 if (!can_migrate_task(p, busiest, this_cpu, 3210 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 3211 sd, idle, &pinned))
@@ -2124,7 +3228,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2124static unsigned long 3228static unsigned long
2125balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2126 unsigned long max_load_move, struct sched_domain *sd, 3230 unsigned long max_load_move, struct sched_domain *sd,
2127 enum cpu_idle_type idle, int *all_pinned, 3231 enum cpu_idle_type idle, int *lb_flags,
2128 struct cfs_rq *busiest_cfs_rq) 3232 struct cfs_rq *busiest_cfs_rq)
2129{ 3233{
2130 int loops = 0, pulled = 0; 3234 int loops = 0, pulled = 0;
@@ -2135,12 +3239,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2135 goto out; 3239 goto out;
2136 3240
2137 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2138 if (loops++ > sysctl_sched_nr_migrate) 3242 if (loops++ > sysctl_sched_nr_migrate) {
3243 *lb_flags |= LBF_NEED_BREAK;
2139 break; 3244 break;
3245 }
2140 3246
2141 if ((p->se.load.weight >> 1) > rem_load_move || 3247 if ((p->se.load.weight >> 1) > rem_load_move ||
2142 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3248 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2143 all_pinned)) 3249 lb_flags))
2144 continue; 3250 continue;
2145 3251
2146 pull_task(busiest, p, this_rq, this_cpu); 3252 pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,8 +3259,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2153 * kernels will stop after the first task is pulled to minimize 3259 * kernels will stop after the first task is pulled to minimize
2154 * the critical section. 3260 * the critical section.
2155 */ 3261 */
2156 if (idle == CPU_NEWLY_IDLE) 3262 if (idle == CPU_NEWLY_IDLE) {
3263 *lb_flags |= LBF_ABORT;
2157 break; 3264 break;
3265 }
2158#endif 3266#endif
2159 3267
2160 /* 3268 /*
@@ -2217,8 +3325,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 3325 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 3326 * list_add_leaf_cfs_rq() for details.
2219 */ 3327 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 3328 for_each_leaf_cfs_rq(rq, cfs_rq) {
3329 /* throttled entities do not contribute to load */
3330 if (throttled_hierarchy(cfs_rq))
3331 continue;
3332
2221 update_shares_cpu(cfs_rq->tg, cpu); 3333 update_shares_cpu(cfs_rq->tg, cpu);
3334 }
2222 rcu_read_unlock(); 3335 rcu_read_unlock();
2223} 3336}
2224 3337
@@ -2254,7 +3367,7 @@ static unsigned long
2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2255 unsigned long max_load_move, 3368 unsigned long max_load_move,
2256 struct sched_domain *sd, enum cpu_idle_type idle, 3369 struct sched_domain *sd, enum cpu_idle_type idle,
2257 int *all_pinned) 3370 int *lb_flags)
2258{ 3371{
2259 long rem_load_move = max_load_move; 3372 long rem_load_move = max_load_move;
2260 struct cfs_rq *busiest_cfs_rq; 3373 struct cfs_rq *busiest_cfs_rq;
@@ -2267,17 +3380,21 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2268 u64 rem_load, moved_load; 3381 u64 rem_load, moved_load;
2269 3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
2270 /* 3386 /*
2271 * empty group 3387 * empty group or part of a throttled hierarchy
2272 */ 3388 */
2273 if (!busiest_cfs_rq->task_weight) 3389 if (!busiest_cfs_rq->task_weight ||
3390 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 3391 continue;
2275 3392
2276 rem_load = (u64)rem_load_move * busiest_weight; 3393 rem_load = (u64)rem_load_move * busiest_weight;
2277 rem_load = div_u64(rem_load, busiest_h_load + 1); 3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
2278 3395
2279 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2280 rem_load, sd, idle, all_pinned, 3397 rem_load, sd, idle, lb_flags,
2281 busiest_cfs_rq); 3398 busiest_cfs_rq);
2282 3399
2283 if (!moved_load) 3400 if (!moved_load)
@@ -2303,10 +3420,10 @@ static unsigned long
2303load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2304 unsigned long max_load_move, 3421 unsigned long max_load_move,
2305 struct sched_domain *sd, enum cpu_idle_type idle, 3422 struct sched_domain *sd, enum cpu_idle_type idle,
2306 int *all_pinned) 3423 int *lb_flags)
2307{ 3424{
2308 return balance_tasks(this_rq, this_cpu, busiest, 3425 return balance_tasks(this_rq, this_cpu, busiest,
2309 max_load_move, sd, idle, all_pinned, 3426 max_load_move, sd, idle, lb_flags,
2310 &busiest->cfs); 3427 &busiest->cfs);
2311} 3428}
2312#endif 3429#endif
@@ -2321,29 +3438,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2321static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2322 unsigned long max_load_move, 3439 unsigned long max_load_move,
2323 struct sched_domain *sd, enum cpu_idle_type idle, 3440 struct sched_domain *sd, enum cpu_idle_type idle,
2324 int *all_pinned) 3441 int *lb_flags)
2325{ 3442{
2326 unsigned long total_load_moved = 0, load_moved; 3443 unsigned long total_load_moved = 0, load_moved;
2327 3444
2328 do { 3445 do {
2329 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2330 max_load_move - total_load_moved, 3447 max_load_move - total_load_moved,
2331 sd, idle, all_pinned); 3448 sd, idle, lb_flags);
2332 3449
2333 total_load_moved += load_moved; 3450 total_load_moved += load_moved;
2334 3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
2335#ifdef CONFIG_PREEMPT 3455#ifdef CONFIG_PREEMPT
2336 /* 3456 /*
2337 * NEWIDLE balancing is a source of latency, so preemptible 3457 * NEWIDLE balancing is a source of latency, so preemptible
2338 * kernels will stop after the first task is pulled to minimize 3458 * kernels will stop after the first task is pulled to minimize
2339 * the critical section. 3459 * the critical section.
2340 */ 3460 */
2341 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
2342 break; 3462 *lb_flags |= LBF_ABORT;
2343
2344 if (raw_spin_is_contended(&this_rq->lock) ||
2345 raw_spin_is_contended(&busiest->lock))
2346 break; 3463 break;
3464 }
2347#endif 3465#endif
2348 } while (load_moved && max_load_move > total_load_moved); 3466 } while (load_moved && max_load_move > total_load_moved);
2349 3467
@@ -2405,15 +3523,6 @@ struct sg_lb_stats {
2405}; 3523};
2406 3524
2407/** 3525/**
2408 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2409 * @group: The group whose first cpu is to be returned.
2410 */
2411static inline unsigned int group_first_cpu(struct sched_group *group)
2412{
2413 return cpumask_first(sched_group_cpus(group));
2414}
2415
2416/**
2417 * get_sd_load_idx - Obtain the load index for a given sched domain. 3526 * get_sd_load_idx - Obtain the load index for a given sched domain.
2418 * @sd: The sched_domain whose load_idx is to be obtained. 3527 * @sd: The sched_domain whose load_idx is to be obtained.
 2419 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3528 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -2662,7 +3771,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2662 sdg->sgp->power = power; 3771 sdg->sgp->power = power;
2663} 3772}
2664 3773
2665static void update_group_power(struct sched_domain *sd, int cpu) 3774void update_group_power(struct sched_domain *sd, int cpu)
2666{ 3775{
2667 struct sched_domain *child = sd->child; 3776 struct sched_domain *child = sd->child;
2668 struct sched_group *group, *sdg = sd->groups; 3777 struct sched_group *group, *sdg = sd->groups;
@@ -2854,7 +3963,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2854} 3963}
2855 3964
2856/** 3965/**
2857 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3966 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2858 * @sd: sched_domain whose statistics are to be updated. 3967 * @sd: sched_domain whose statistics are to be updated.
2859 * @this_cpu: Cpu for which load balance is currently performed. 3968 * @this_cpu: Cpu for which load balance is currently performed.
2860 * @idle: Idle status of this_cpu 3969 * @idle: Idle status of this_cpu
@@ -2928,11 +4037,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2928 } while (sg != sd->groups); 4037 } while (sg != sd->groups);
2929} 4038}
2930 4039
2931int __weak arch_sd_sibling_asym_packing(void)
2932{
2933 return 0*SD_ASYM_PACKING;
2934}
2935
2936/** 4040/**
2937 * check_asym_packing - Check to see if the group is packed into the 4041 * check_asym_packing - Check to see if the group is packed into the
2938 * sched domain. 4042 * sched domain.
@@ -3296,7 +4400,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3296#define MAX_PINNED_INTERVAL 512 4400#define MAX_PINNED_INTERVAL 512
3297 4401
3298/* Working cpumask for load_balance and load_balance_newidle. */ 4402/* Working cpumask for load_balance and load_balance_newidle. */
3299static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4403DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3300 4404
3301static int need_active_balance(struct sched_domain *sd, int idle, 4405static int need_active_balance(struct sched_domain *sd, int idle,
3302 int busiest_cpu, int this_cpu) 4406 int busiest_cpu, int this_cpu)
@@ -3347,7 +4451,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3347 struct sched_domain *sd, enum cpu_idle_type idle, 4451 struct sched_domain *sd, enum cpu_idle_type idle,
3348 int *balance) 4452 int *balance)
3349{ 4453{
3350 int ld_moved, all_pinned = 0, active_balance = 0; 4454 int ld_moved, lb_flags = 0, active_balance = 0;
3351 struct sched_group *group; 4455 struct sched_group *group;
3352 unsigned long imbalance; 4456 unsigned long imbalance;
3353 struct rq *busiest; 4457 struct rq *busiest;
@@ -3388,11 +4492,11 @@ redo:
3388 * still unbalanced. ld_moved simply stays zero, so it is 4492 * still unbalanced. ld_moved simply stays zero, so it is
3389 * correctly treated as an imbalance. 4493 * correctly treated as an imbalance.
3390 */ 4494 */
3391 all_pinned = 1; 4495 lb_flags |= LBF_ALL_PINNED;
3392 local_irq_save(flags); 4496 local_irq_save(flags);
3393 double_rq_lock(this_rq, busiest); 4497 double_rq_lock(this_rq, busiest);
3394 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4498 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3395 imbalance, sd, idle, &all_pinned); 4499 imbalance, sd, idle, &lb_flags);
3396 double_rq_unlock(this_rq, busiest); 4500 double_rq_unlock(this_rq, busiest);
3397 local_irq_restore(flags); 4501 local_irq_restore(flags);
3398 4502
@@ -3402,8 +4506,18 @@ redo:
3402 if (ld_moved && this_cpu != smp_processor_id()) 4506 if (ld_moved && this_cpu != smp_processor_id())
3403 resched_cpu(this_cpu); 4507 resched_cpu(this_cpu);
3404 4508
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
3405 /* All tasks on this runqueue were pinned by CPU affinity */ 4519 /* All tasks on this runqueue were pinned by CPU affinity */
3406 if (unlikely(all_pinned)) { 4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
3407 cpumask_clear_cpu(cpu_of(busiest), cpus); 4521 cpumask_clear_cpu(cpu_of(busiest), cpus);
3408 if (!cpumask_empty(cpus)) 4522 if (!cpumask_empty(cpus))
3409 goto redo; 4523 goto redo;
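
The lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK statement above clears the break request and counts it in the same addition; once enough breaks have accumulated, the carry lands in LBF_ABORT, which is presumably why the flag is re-tested immediately afterwards. The sketch below demonstrates the arithmetic with assumed flag values chosen so that adjacent bits form a small counter; the real constants live in the scheduler sources and may differ.

/*
 * Sketch of the "had break" accounting done in load_balance() above.
 * Flag values are assumptions chosen so the arithmetic works: the
 * addition clears NEED_BREAK, bumps a two-bit break counter, and
 * eventually carries into LBF_ABORT.
 */
#include <stdio.h>

#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02     /* request: drop the locks and retry */
#define LBF_HAD_BREAK  0x04     /* one retry already taken */
#define LBF_HAD_BREAKS 0x0C     /* two-bit counter of retries taken */
#define LBF_ABORT      0x10     /* counter overflow: stop balancing */

int main(void)
{
    int lb_flags = 0;
    int round;

    for (round = 1; round <= 5; round++) {
        lb_flags |= LBF_NEED_BREAK;                  /* the move loop asked for a break */

        lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;  /* clear the request, count it */

        printf("round %d: flags=%#x%s\n", round, lb_flags,
               (lb_flags & LBF_ABORT) ? " -> abort" : " -> redo");

        if (lb_flags & LBF_ABORT)
            break;
    }
    return 0;
}
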
@@ -3430,10 +4544,10 @@ redo:
3430 * moved to this_cpu 4544 * moved to this_cpu
3431 */ 4545 */
3432 if (!cpumask_test_cpu(this_cpu, 4546 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4547 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4548 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4549 flags);
3436 all_pinned = 1; 4550 lb_flags |= LBF_ALL_PINNED;
3437 goto out_one_pinned; 4551 goto out_one_pinned;
3438 } 4552 }
3439 4553
@@ -3486,7 +4600,8 @@ out_balanced:
3486 4600
3487out_one_pinned: 4601out_one_pinned:
3488 /* tune up the balancing interval */ 4602 /* tune up the balancing interval */
3489 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4603 if (((lb_flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) ||
3490 (sd->balance_interval < sd->max_interval)) 4605 (sd->balance_interval < sd->max_interval))
3491 sd->balance_interval *= 2; 4606 sd->balance_interval *= 2;
3492 4607
@@ -3499,7 +4614,7 @@ out:
3499 * idle_balance is called by schedule() if this_cpu is about to become 4614 * idle_balance is called by schedule() if this_cpu is about to become
3500 * idle. Attempts to pull tasks from other CPUs. 4615 * idle. Attempts to pull tasks from other CPUs.
3501 */ 4616 */
3502static void idle_balance(int this_cpu, struct rq *this_rq) 4617void idle_balance(int this_cpu, struct rq *this_rq)
3503{ 4618{
3504 struct sched_domain *sd; 4619 struct sched_domain *sd;
3505 int pulled_task = 0; 4620 int pulled_task = 0;
@@ -3612,46 +4727,18 @@ out_unlock:
3612} 4727}
3613 4728
3614#ifdef CONFIG_NO_HZ 4729#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4730/*
3632 * idle load balancing details 4731 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while
3634 * entering idle.
3635 * - This idle load balancer CPU will also go into tickless mode when
3636 * it is idle, just like all other idle CPUs
3637 * - When one of the busy CPUs notice that there may be an idle rebalancing 4732 * - When one of the busy CPUs notice that there may be an idle rebalancing
3638 * needed, they will kick the idle load balancer, which then does idle 4733 * needed, they will kick the idle load balancer, which then does idle
3639 * load balancing for all the idle CPUs. 4734 * load balancing for all the idle CPUs.
3640 */ 4735 */
3641static struct { 4736static struct {
3642 atomic_t load_balancer;
3643 atomic_t first_pick_cpu;
3644 atomic_t second_pick_cpu;
3645 cpumask_var_t idle_cpus_mask; 4737 cpumask_var_t idle_cpus_mask;
3646 cpumask_var_t grp_idle_mask; 4738 atomic_t nr_cpus;
3647 unsigned long next_balance; /* in jiffy units */ 4739 unsigned long next_balance; /* in jiffy units */
3648} nohz ____cacheline_aligned; 4740} nohz ____cacheline_aligned;
3649 4741
3650int get_nohz_load_balancer(void)
3651{
3652 return atomic_read(&nohz.load_balancer);
3653}
3654
3655#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4742#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3656/** 4743/**
3657 * lowest_flag_domain - Return lowest sched_domain containing flag. 4744 * lowest_flag_domain - Return lowest sched_domain containing flag.
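
The nohz tracker in the hunk above shrinks to a mask of tickless-idle CPUs plus an atomic count, instead of electing first/second pick CPUs and a dedicated load_balancer. Below is a userspace sketch of that bookkeeping, with a plain 64-bit mask and C11 atomics standing in for cpumask_var_t; the names and the 64-CPU limit are illustrative assumptions, not the kernel API.

/*
 * Userspace sketch of the slimmed-down nohz bookkeeping above: just a
 * mask of tickless-idle CPUs and an atomic count of them.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static struct {
    _Atomic uint64_t idle_cpus_mask;    /* bit n set => cpu n is tickless idle */
    atomic_int       nr_cpus;           /* how many bits are set */
} nohz;

static void cpu_enters_tickless_idle(int cpu)
{
    atomic_fetch_or(&nohz.idle_cpus_mask, UINT64_C(1) << cpu);
    atomic_fetch_add(&nohz.nr_cpus, 1);
}

static void cpu_leaves_tickless_idle(int cpu)
{
    atomic_fetch_and(&nohz.idle_cpus_mask, ~(UINT64_C(1) << cpu));
    atomic_fetch_sub(&nohz.nr_cpus, 1);
}

int main(void)
{
    cpu_enters_tickless_idle(2);
    cpu_enters_tickless_idle(5);
    cpu_leaves_tickless_idle(2);

    printf("idle mask=%#llx, nr_cpus=%d\n",
           (unsigned long long)atomic_load(&nohz.idle_cpus_mask),
           atomic_load(&nohz.nr_cpus));
    return 0;
}
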
@@ -3667,7 +4754,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4754 struct sched_domain *sd;
3668 4755
3669 for_each_domain(cpu, sd) 4756 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4757 if (sd->flags & flag)
3671 break; 4758 break;
3672 4759
3673 return sd; 4760 return sd;
@@ -3688,33 +4775,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3688 (sd && (sd->flags & flag)); sd = sd->parent) 4775 (sd && (sd->flags & flag)); sd = sd->parent)
3689 4776
3690/** 4777/**
3691 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3692 * @ilb_group: group to be checked for semi-idleness
3693 *
3694 * Returns: 1 if the group is semi-idle. 0 otherwise.
3695 *
3696 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3697 * and atleast one non-idle CPU. This helper function checks if the given
3698 * sched_group is semi-idle or not.
3699 */
3700static inline int is_semi_idle_group(struct sched_group *ilb_group)
3701{
3702 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3703 sched_group_cpus(ilb_group));
3704
3705 /*
3706 * A sched_group is semi-idle when it has atleast one busy cpu
3707 * and atleast one idle cpu.
3708 */
3709 if (cpumask_empty(nohz.grp_idle_mask))
3710 return 0;
3711
3712 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3713 return 0;
3714
3715 return 1;
3716}
3717/**
3718 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4778 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3719 * @cpu: The cpu which is nominating a new idle_load_balancer. 4779 * @cpu: The cpu which is nominating a new idle_load_balancer.
3720 * 4780 *
@@ -3728,9 +4788,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
3728 */ 4788 */
3729static int find_new_ilb(int cpu) 4789static int find_new_ilb(int cpu)
3730{ 4790{
4791 int ilb = cpumask_first(nohz.idle_cpus_mask);
4792 struct sched_group *ilbg;
3731 struct sched_domain *sd; 4793 struct sched_domain *sd;
3732 struct sched_group *ilb_group;
3733 int ilb = nr_cpu_ids;
3734 4794
3735 /* 4795 /*
3736 * Have idle load balancer selection from semi-idle packages only 4796 * Have idle load balancer selection from semi-idle packages only
@@ -3748,23 +4808,28 @@ static int find_new_ilb(int cpu)
3748 4808
3749 rcu_read_lock(); 4809 rcu_read_lock();
3750 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4810 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3751 ilb_group = sd->groups; 4811 ilbg = sd->groups;
3752 4812
3753 do { 4813 do {
3754 if (is_semi_idle_group(ilb_group)) { 4814 if (ilbg->group_weight !=
3755 ilb = cpumask_first(nohz.grp_idle_mask); 4815 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4816 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4817 sched_group_cpus(ilbg));
3756 goto unlock; 4818 goto unlock;
3757 } 4819 }
3758 4820
3759 ilb_group = ilb_group->next; 4821 ilbg = ilbg->next;
3760 4822
3761 } while (ilb_group != sd->groups); 4823 } while (ilbg != sd->groups);
3762 } 4824 }
3763unlock: 4825unlock:
3764 rcu_read_unlock(); 4826 rcu_read_unlock();
3765 4827
3766out_done: 4828out_done:
3767 return ilb; 4829 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4830 return ilb;
4831
4832 return nr_cpu_ids;
3768} 4833}
3769#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4834#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3770static inline int find_new_ilb(int call_cpu) 4835static inline int find_new_ilb(int call_cpu)
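
find_new_ilb() above now treats a group as a candidate whenever its weight differs from its nr_busy_cpus count, and then picks the first CPU that is both idle and inside the group. Here is a reduced sketch of that test using plain bitmasks; the 64-CPU limit and the helper names are assumptions for illustration only.

/*
 * Sketch of the group test used in find_new_ilb() above, done with
 * plain 64-bit masks instead of cpumasks.
 */
#include <stdint.h>
#include <stdio.h>

/* First set bit, or 64 if the mask is empty (stand-in for nr_cpu_ids). */
static int first_cpu(uint64_t mask)
{
    return mask ? __builtin_ctzll(mask) : 64;
}

static int pick_ilb(uint64_t idle_mask, uint64_t group_mask, int nr_busy)
{
    int group_weight = __builtin_popcountll(group_mask);

    /* The group is not fully busy, so it has an idle CPU to offer. */
    if (group_weight != nr_busy)
        return first_cpu(idle_mask & group_mask);

    return 64;      /* fully busy group: no candidate here */
}

int main(void)
{
    uint64_t group = 0x0F;      /* CPUs 0-3 share a package */
    uint64_t idle  = 0x0C;      /* CPUs 2 and 3 are tickless idle */

    printf("ilb candidate: cpu %d\n", pick_ilb(idle, group, 2));
    return 0;
}
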
@@ -3784,94 +4849,68 @@ static void nohz_balancer_kick(int cpu)
3784 4849
3785 nohz.next_balance++; 4850 nohz.next_balance++;
3786 4851
3787 ilb_cpu = get_nohz_load_balancer(); 4852 ilb_cpu = find_new_ilb(cpu);
3788
3789 if (ilb_cpu >= nr_cpu_ids) {
3790 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3791 if (ilb_cpu >= nr_cpu_ids)
3792 return;
3793 }
3794 4853
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4854 if (ilb_cpu >= nr_cpu_ids)
3796 struct call_single_data *cp; 4855 return;
3797 4856
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4857 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4858 return;
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4859 /*
3801 } 4860 * Use smp_send_reschedule() instead of resched_cpu().
4861 * This way we generate a sched IPI on the target cpu which
4862 * is idle. And the softirq performing nohz idle load balance
4863 * will be run before returning from the IPI.
4864 */
4865 smp_send_reschedule(ilb_cpu);
3802 return; 4866 return;
3803} 4867}
3804 4868
3805/* 4869static inline void set_cpu_sd_state_busy(void)
3806 * This routine will try to nominate the ilb (idle load balancing)
3807 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3808 * load balancing on behalf of all those cpus.
3809 *
3810 * When the ilb owner becomes busy, we will not have new ilb owner until some
3811 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3812 * idle load balancing by kicking one of the idle CPUs.
3813 *
3814 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3815 * ilb owner CPU in future (when there is a need for idle load balancing on
3816 * behalf of all idle CPUs).
3817 */
3818void select_nohz_load_balancer(int stop_tick)
3819{ 4870{
4871 struct sched_domain *sd;
3820 int cpu = smp_processor_id(); 4872 int cpu = smp_processor_id();
3821 4873
3822 if (stop_tick) { 4874 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
3823 if (!cpu_active(cpu)) { 4875 return;
3824 if (atomic_read(&nohz.load_balancer) != cpu) 4876 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
3825 return;
3826
3827 /*
3828 * If we are going offline and still the leader,
3829 * give up!
3830 */
3831 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3832 nr_cpu_ids) != cpu)
3833 BUG();
3834 4877
3835 return; 4878 rcu_read_lock();
3836 } 4879 for_each_domain(cpu, sd)
4880 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4881 rcu_read_unlock();
4882}
3837 4883
3838 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4884void set_cpu_sd_state_idle(void)
4885{
4886 struct sched_domain *sd;
4887 int cpu = smp_processor_id();
3839 4888
3840 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4889 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
3841 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4890 return;
3842 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4891 set_bit(NOHZ_IDLE, nohz_flags(cpu));
3843 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3844 4892
3845 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4893 rcu_read_lock();
3846 int new_ilb; 4894 for_each_domain(cpu, sd)
4895 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4896 rcu_read_unlock();
4897}
3847 4898
3848 /* make me the ilb owner */ 4899/*
3849 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4900 * This routine will record that this cpu is going idle with tick stopped.
3850 cpu) != nr_cpu_ids) 4901 * This info will be used in performing idle load balancing in the future.
3851 return; 4902 */
4903void select_nohz_load_balancer(int stop_tick)
4904{
4905 int cpu = smp_processor_id();
3852 4906
3853 /* 4907 if (stop_tick) {
3854 * Check to see if there is a more power-efficient 4908 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
3855 * ilb.
3856 */
3857 new_ilb = find_new_ilb(cpu);
3858 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3859 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3860 resched_cpu(new_ilb);
3861 return;
3862 }
3863 return;
3864 }
3865 } else {
3866 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3867 return; 4909 return;
3868 4910
3869 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4911 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3870 4912 atomic_inc(&nohz.nr_cpus);
3871 if (atomic_read(&nohz.load_balancer) == cpu) 4913 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
3872 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3873 nr_cpu_ids) != cpu)
3874 BUG();
3875 } 4914 }
3876 return; 4915 return;
3877} 4916}
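
nohz_balancer_kick() above arms a per-cpu NOHZ_BALANCE_KICK bit with test_and_set_bit() before sending the reschedule IPI, so a target that has already been kicked is not kicked again until its balancing softirq clears the bit. The sketch below reproduces that guard with C11 atomics; the array of flag words and the printf stand-in for the IPI are illustrative assumptions.

/*
 * Sketch of the duplicate-kick guard used in nohz_balancer_kick() above:
 * an atomic test-and-set on a per-cpu flag word so the target is kicked
 * at most once per pending balance.
 */
#include <stdatomic.h>
#include <stdio.h>

enum { NOHZ_BALANCE_KICK_BIT = 0 };

static atomic_uint nohz_flags[4];           /* one flag word per "cpu" */

static void send_kick_ipi(int cpu)          /* stand-in for smp_send_reschedule() */
{
    printf("IPI -> cpu %d\n", cpu);
}

static void kick_ilb(int ilb_cpu)
{
    unsigned int bit = 1u << NOHZ_BALANCE_KICK_BIT;

    /* Already kicked and not yet serviced: nothing to do. */
    if (atomic_fetch_or(&nohz_flags[ilb_cpu], bit) & bit)
        return;

    send_kick_ipi(ilb_cpu);
}

static void idle_balance_done(int cpu)      /* what the softirq would do at the end */
{
    atomic_fetch_and(&nohz_flags[cpu], ~(1u << NOHZ_BALANCE_KICK_BIT));
}

int main(void)
{
    kick_ilb(2);            /* first kick sends the IPI */
    kick_ilb(2);            /* second one is absorbed   */
    idle_balance_done(2);
    kick_ilb(2);            /* armed again after the balance ran */
    return 0;
}
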
@@ -3885,7 +4924,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3885 * Scale the max load_balance interval with the number of CPUs in the system. 4924 * Scale the max load_balance interval with the number of CPUs in the system.
3886 * This trades load-balance latency on larger machines for less cross talk. 4925 * This trades load-balance latency on larger machines for less cross talk.
3887 */ 4926 */
3888static void update_max_interval(void) 4927void update_max_interval(void)
3889{ 4928{
3890 max_load_balance_interval = HZ*num_online_cpus()/10; 4929 max_load_balance_interval = HZ*num_online_cpus()/10;
3891} 4930}
@@ -3977,11 +5016,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3977 struct rq *rq; 5016 struct rq *rq;
3978 int balance_cpu; 5017 int balance_cpu;
3979 5018
3980 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5019 if (idle != CPU_IDLE ||
3981 return; 5020 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5021 goto end;
3982 5022
3983 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5023 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3984 if (balance_cpu == this_cpu) 5024 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
3985 continue; 5025 continue;
3986 5026
3987 /* 5027 /*
@@ -3989,10 +5029,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3989 * work being done for other cpus. Next load 5029 * work being done for other cpus. Next load
3990 * balancing owner will pick it up. 5030 * balancing owner will pick it up.
3991 */ 5031 */
3992 if (need_resched()) { 5032 if (need_resched())
3993 this_rq->nohz_balance_kick = 0;
3994 break; 5033 break;
3995 }
3996 5034
3997 raw_spin_lock_irq(&this_rq->lock); 5035 raw_spin_lock_irq(&this_rq->lock);
3998 update_rq_clock(this_rq); 5036 update_rq_clock(this_rq);
@@ -4006,53 +5044,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4006 this_rq->next_balance = rq->next_balance; 5044 this_rq->next_balance = rq->next_balance;
4007 } 5045 }
4008 nohz.next_balance = this_rq->next_balance; 5046 nohz.next_balance = this_rq->next_balance;
4009 this_rq->nohz_balance_kick = 0; 5047end:
5048 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4010} 5049}
4011 5050
4012/* 5051/*
4013 * Current heuristic for kicking the idle load balancer 5052 * Current heuristic for kicking the idle load balancer in the presence
4014 * - first_pick_cpu is the one of the busy CPUs. It will kick 5053 * of an idle cpu in the system.
4015 * idle load balancer when it has more than one process active. This 5054 * - This rq has more than one task.
4016 * eliminates the need for idle load balancing altogether when we have 5055 * - At any scheduler domain level, this cpu's scheduler group has multiple
4017 * only one running process in the system (common case). 5056 * busy cpu's exceeding the group's power.
4018 * - If there are more than one busy CPU, idle load balancer may have 5057 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4019 * to run for active_load_balance to happen (i.e., two busy CPUs are 5058 * domain span are idle.
4020 * SMT or core siblings and can run better if they move to different
4021 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4022 * which will kick idle load balancer as soon as it has any load.
4023 */ 5059 */
4024static inline int nohz_kick_needed(struct rq *rq, int cpu) 5060static inline int nohz_kick_needed(struct rq *rq, int cpu)
4025{ 5061{
4026 unsigned long now = jiffies; 5062 unsigned long now = jiffies;
4027 int ret; 5063 struct sched_domain *sd;
4028 int first_pick_cpu, second_pick_cpu;
4029 5064
4030 if (time_before(now, nohz.next_balance)) 5065 if (unlikely(idle_cpu(cpu)))
4031 return 0; 5066 return 0;
4032 5067
4033 if (rq->idle_at_tick) 5068 /*
4034 return 0; 5069 * We may be recently in ticked or tickless idle mode. At the first
5070 * busy tick after returning from idle, we will update the busy stats.
5071 */
5072 set_cpu_sd_state_busy();
5073 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5074 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5075 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5076 atomic_dec(&nohz.nr_cpus);
5077 }
4035 5078
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5079 /*
4037 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5080 * None are in tickless mode and hence no need for NOHZ idle load
5081 * balancing.
5082 */
5083 if (likely(!atomic_read(&nohz.nr_cpus)))
5084 return 0;
4038 5085
4039 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5086 if (time_before(now, nohz.next_balance))
4040 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4041 return 0; 5087 return 0;
4042 5088
4043 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5089 if (rq->nr_running >= 2)
4044 if (ret == nr_cpu_ids || ret == cpu) { 5090 goto need_kick;
4045 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5091
4046 if (rq->nr_running > 1) 5092 rcu_read_lock();
4047 return 1; 5093 for_each_domain(cpu, sd) {
4048 } else { 5094 struct sched_group *sg = sd->groups;
4049 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5095 struct sched_group_power *sgp = sg->sgp;
4050 if (ret == nr_cpu_ids || ret == cpu) { 5096 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4051 if (rq->nr_running) 5097
4052 return 1; 5098 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4053 } 5099 goto need_kick_unlock;
5100
5101 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5102 && (cpumask_first_and(nohz.idle_cpus_mask,
5103 sched_domain_span(sd)) < cpu))
5104 goto need_kick_unlock;
5105
5106 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5107 break;
4054 } 5108 }
5109 rcu_read_unlock();
4055 return 0; 5110 return 0;
5111
5112need_kick_unlock:
5113 rcu_read_unlock();
5114need_kick:
5115 return 1;
4056} 5116}
4057#else 5117#else
4058static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5118static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
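
The rewritten nohz_kick_needed() above boils down to a few cheap tests: no kick when this CPU is itself idle, when no CPU is tickless, or when the nohz.next_balance rate limit has not expired; a kick when this runqueue holds two or more tasks; and a kick when a shared-cache domain level reports more than one busy CPU. The sketch below expresses that as a pure decision function; the struct fields are assumptions, and the SD_ASYM_PACKING branch is omitted for brevity.

/*
 * Sketch of the nohz kick heuristic above, reduced to a decision
 * function over a couple of plain inputs. The kernel walks real
 * sched domains instead of this toy array.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_domain {
    bool shares_pkg_resources;      /* SD_SHARE_PKG_RESOURCES-like level */
    int  nr_busy;                   /* busy CPUs in this cpu's group     */
};

static bool nohz_kick_needed(int nr_running, int nr_tickless_idle,
                             const struct toy_domain *sd, int nr_levels)
{
    int i;

    if (nr_tickless_idle == 0)      /* nobody to balance on our behalf */
        return false;

    if (nr_running >= 2)            /* this cpu is overloaded */
        return true;

    /* A shared-cache level with more than one busy CPU also wants help. */
    for (i = 0; i < nr_levels; i++)
        if (sd[i].shares_pkg_resources && sd[i].nr_busy > 1)
            return true;

    return false;
}

int main(void)
{
    struct toy_domain levels[] = { { true, 2 }, { false, 5 } };

    printf("kick: %d\n", nohz_kick_needed(1, 3, levels, 2));
    return 0;
}
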
@@ -4066,7 +5126,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 5126{
4067 int this_cpu = smp_processor_id(); 5127 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 5128 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 5129 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 5130 CPU_IDLE : CPU_NOT_IDLE;
4071 5131
4072 rebalance_domains(this_cpu, idle); 5132 rebalance_domains(this_cpu, idle);
@@ -4087,14 +5147,14 @@ static inline int on_null_domain(int cpu)
4087/* 5147/*
4088 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5148 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4089 */ 5149 */
4090static inline void trigger_load_balance(struct rq *rq, int cpu) 5150void trigger_load_balance(struct rq *rq, int cpu)
4091{ 5151{
4092 /* Don't need to rebalance while attached to NULL domain */ 5152 /* Don't need to rebalance while attached to NULL domain */
4093 if (time_after_eq(jiffies, rq->next_balance) && 5153 if (time_after_eq(jiffies, rq->next_balance) &&
4094 likely(!on_null_domain(cpu))) 5154 likely(!on_null_domain(cpu)))
4095 raise_softirq(SCHED_SOFTIRQ); 5155 raise_softirq(SCHED_SOFTIRQ);
4096#ifdef CONFIG_NO_HZ 5156#ifdef CONFIG_NO_HZ
4097 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5157 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4098 nohz_balancer_kick(cpu); 5158 nohz_balancer_kick(cpu);
4099#endif 5159#endif
4100} 5160}
@@ -4109,15 +5169,6 @@ static void rq_offline_fair(struct rq *rq)
4109 update_sysctl(); 5169 update_sysctl();
4110} 5170}
4111 5171
4112#else /* CONFIG_SMP */
4113
4114/*
4115 * on UP we do not need to balance between CPUs:
4116 */
4117static inline void idle_balance(int cpu, struct rq *rq)
4118{
4119}
4120
4121#endif /* CONFIG_SMP */ 5172#endif /* CONFIG_SMP */
4122 5173
4123/* 5174/*
@@ -4141,8 +5192,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4141 */ 5192 */
4142static void task_fork_fair(struct task_struct *p) 5193static void task_fork_fair(struct task_struct *p)
4143{ 5194{
4144 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5195 struct cfs_rq *cfs_rq;
4145 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5196 struct sched_entity *se = &p->se, *curr;
4146 int this_cpu = smp_processor_id(); 5197 int this_cpu = smp_processor_id();
4147 struct rq *rq = this_rq(); 5198 struct rq *rq = this_rq();
4148 unsigned long flags; 5199 unsigned long flags;
@@ -4151,6 +5202,9 @@ static void task_fork_fair(struct task_struct *p)
4151 5202
4152 update_rq_clock(rq); 5203 update_rq_clock(rq);
4153 5204
5205 cfs_rq = task_cfs_rq(current);
5206 curr = cfs_rq->curr;
5207
4154 if (unlikely(task_cpu(p) != this_cpu)) { 5208 if (unlikely(task_cpu(p) != this_cpu)) {
4155 rcu_read_lock(); 5209 rcu_read_lock();
4156 __set_task_cpu(p, this_cpu); 5210 __set_task_cpu(p, this_cpu);
@@ -4251,8 +5305,23 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 5305{
4252 struct sched_entity *se = &rq->curr->se; 5306 struct sched_entity *se = &rq->curr->se;
4253 5307
4254 for_each_sched_entity(se) 5308 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 5309 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5310
5311 set_next_entity(cfs_rq, se);
5312 /* ensure bandwidth has been allocated on our new cfs_rq */
5313 account_cfs_rq_runtime(cfs_rq, 0);
5314 }
5315}
5316
5317void init_cfs_rq(struct cfs_rq *cfs_rq)
5318{
5319 cfs_rq->tasks_timeline = RB_ROOT;
5320 INIT_LIST_HEAD(&cfs_rq->tasks);
5321 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5322#ifndef CONFIG_64BIT
5323 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5324#endif
4256} 5325}
4257 5326
4258#ifdef CONFIG_FAIR_GROUP_SCHED 5327#ifdef CONFIG_FAIR_GROUP_SCHED
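
init_cfs_rq() above seeds min_vruntime with (u64)(-(1LL << 20)), a value sitting roughly a millisecond's worth of nanoseconds below the 64-bit wrap point, presumably so that wraparound bugs in vruntime comparisons surface soon after boot rather than after weeks of uptime. A small sketch of that sentinel and of the signed-difference comparison idiom it relies on:

/*
 * Sketch of the min_vruntime sentinel used in init_cfs_rq() above.
 * Starting just below the u64 wrap exercises the wraparound path early.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* True if a is "after" b, even across a u64 wrap. */
static int vruntime_after(uint64_t a, uint64_t b)
{
    return (int64_t)(a - b) > 0;
}

int main(void)
{
    uint64_t min_vruntime = (uint64_t)(-(1LL << 20));   /* ~1ms of ns below wrap */
    uint64_t later = min_vruntime + (2ULL << 20);       /* past the wrap point   */

    printf("start: %" PRIu64 "\n", min_vruntime);
    printf("later is after start: %d\n", vruntime_after(later, min_vruntime));
    return 0;
}
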
@@ -4271,13 +5340,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
4271 * to another cgroup's rq. This does somewhat interfere with the 5340 * to another cgroup's rq. This does somewhat interfere with the
4272 * fair sleeper stuff for the first placement, but who cares. 5341 * fair sleeper stuff for the first placement, but who cares.
4273 */ 5342 */
5343 /*
5344 * When !on_rq, vruntime of the task has usually NOT been normalized.
5345 * But there are some cases where it has already been normalized:
5346 *
5347 * - Moving a forked child which is waiting for being woken up by
5348 * wake_up_new_task().
5349 * - Moving a task which has been woken up by try_to_wake_up() and
5350 * waiting for actually being woken up by sched_ttwu_pending().
5351 *
5352 * To prevent boost or penalty in the new cfs_rq caused by delta
5353 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5354 */
5355 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5356 on_rq = 1;
5357
4274 if (!on_rq) 5358 if (!on_rq)
4275 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5359 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4276 set_task_rq(p, task_cpu(p)); 5360 set_task_rq(p, task_cpu(p));
4277 if (!on_rq) 5361 if (!on_rq)
4278 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5362 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
4279} 5363}
5364
5365void free_fair_sched_group(struct task_group *tg)
5366{
5367 int i;
5368
5369 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5370
5371 for_each_possible_cpu(i) {
5372 if (tg->cfs_rq)
5373 kfree(tg->cfs_rq[i]);
5374 if (tg->se)
5375 kfree(tg->se[i]);
5376 }
5377
5378 kfree(tg->cfs_rq);
5379 kfree(tg->se);
5380}
5381
5382int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5383{
5384 struct cfs_rq *cfs_rq;
5385 struct sched_entity *se;
5386 int i;
5387
5388 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5389 if (!tg->cfs_rq)
5390 goto err;
5391 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5392 if (!tg->se)
5393 goto err;
5394
5395 tg->shares = NICE_0_LOAD;
5396
5397 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5398
5399 for_each_possible_cpu(i) {
5400 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5401 GFP_KERNEL, cpu_to_node(i));
5402 if (!cfs_rq)
5403 goto err;
5404
5405 se = kzalloc_node(sizeof(struct sched_entity),
5406 GFP_KERNEL, cpu_to_node(i));
5407 if (!se)
5408 goto err_free_rq;
5409
5410 init_cfs_rq(cfs_rq);
5411 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5412 }
5413
5414 return 1;
5415
5416err_free_rq:
5417 kfree(cfs_rq);
5418err:
5419 return 0;
5420}
5421
5422void unregister_fair_sched_group(struct task_group *tg, int cpu)
5423{
5424 struct rq *rq = cpu_rq(cpu);
5425 unsigned long flags;
5426
5427 /*
5428 * Only empty task groups can be destroyed; so we can speculatively
5429 * check on_list without danger of it being re-added.
5430 */
5431 if (!tg->cfs_rq[cpu]->on_list)
5432 return;
5433
5434 raw_spin_lock_irqsave(&rq->lock, flags);
5435 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5436 raw_spin_unlock_irqrestore(&rq->lock, flags);
5437}
5438
5439void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5440 struct sched_entity *se, int cpu,
5441 struct sched_entity *parent)
5442{
5443 struct rq *rq = cpu_rq(cpu);
5444
5445 cfs_rq->tg = tg;
5446 cfs_rq->rq = rq;
5447#ifdef CONFIG_SMP
5448 /* allow initial update_cfs_load() to truncate */
5449 cfs_rq->load_stamp = 1;
4280#endif 5450#endif
5451 init_cfs_rq_runtime(cfs_rq);
5452
5453 tg->cfs_rq[cpu] = cfs_rq;
5454 tg->se[cpu] = se;
5455
5456 /* se could be NULL for root_task_group */
5457 if (!se)
5458 return;
5459
5460 if (!parent)
5461 se->cfs_rq = &rq->cfs;
5462 else
5463 se->cfs_rq = parent->my_q;
5464
5465 se->my_q = cfs_rq;
5466 update_load_set(&se->load, 0);
5467 se->parent = parent;
5468}
5469
5470static DEFINE_MUTEX(shares_mutex);
5471
5472int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5473{
5474 int i;
5475 unsigned long flags;
5476
5477 /*
5478 * We can't change the weight of the root cgroup.
5479 */
5480 if (!tg->se[0])
5481 return -EINVAL;
5482
5483 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5484
5485 mutex_lock(&shares_mutex);
5486 if (tg->shares == shares)
5487 goto done;
5488
5489 tg->shares = shares;
5490 for_each_possible_cpu(i) {
5491 struct rq *rq = cpu_rq(i);
5492 struct sched_entity *se;
5493
5494 se = tg->se[i];
5495 /* Propagate contribution to hierarchy */
5496 raw_spin_lock_irqsave(&rq->lock, flags);
5497 for_each_sched_entity(se)
5498 update_cfs_shares(group_cfs_rq(se));
5499 raw_spin_unlock_irqrestore(&rq->lock, flags);
5500 }
5501
5502done:
5503 mutex_unlock(&shares_mutex);
5504 return 0;
5505}
5506#else /* CONFIG_FAIR_GROUP_SCHED */
5507
5508void free_fair_sched_group(struct task_group *tg) { }
5509
5510int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5511{
5512 return 1;
5513}
5514
5515void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5516
5517#endif /* CONFIG_FAIR_GROUP_SCHED */
5518
4281 5519
4282static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5520static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
4283{ 5521{
@@ -4297,7 +5535,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
4297/* 5535/*
4298 * All the scheduling class methods: 5536 * All the scheduling class methods:
4299 */ 5537 */
4300static const struct sched_class fair_sched_class = { 5538const struct sched_class fair_sched_class = {
4301 .next = &idle_sched_class, 5539 .next = &idle_sched_class,
4302 .enqueue_task = enqueue_task_fair, 5540 .enqueue_task = enqueue_task_fair,
4303 .dequeue_task = dequeue_task_fair, 5541 .dequeue_task = dequeue_task_fair,
@@ -4334,7 +5572,7 @@ static const struct sched_class fair_sched_class = {
4334}; 5572};
4335 5573
4336#ifdef CONFIG_SCHED_DEBUG 5574#ifdef CONFIG_SCHED_DEBUG
4337static void print_cfs_stats(struct seq_file *m, int cpu) 5575void print_cfs_stats(struct seq_file *m, int cpu)
4338{ 5576{
4339 struct cfs_rq *cfs_rq; 5577 struct cfs_rq *cfs_rq;
4340 5578
@@ -4344,3 +5582,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
4344 rcu_read_unlock(); 5582 rcu_read_unlock();
4345} 5583}
4346#endif 5584#endif
5585
5586__init void init_sched_fair_class(void)
5587{
5588#ifdef CONFIG_SMP
5589 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5590
5591#ifdef CONFIG_NO_HZ
5592 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5593#endif
5594#endif /* SMP */
5595
5596}
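
alloc_fair_sched_group() above follows the usual per-cpu allocation shape: pointer arrays first, then one cfs_rq and one sched_entity per possible CPU, with a goto-based unwind on failure. The userspace sketch below mirrors that shape with calloc/free and a single cleanup helper; it simplifies the kernel's partial-unwind labels, and its types and CPU count are illustrative assumptions.

/*
 * Userspace sketch of the per-cpu allocation/unwind pattern used by
 * alloc_fair_sched_group() above.
 */
#include <stdlib.h>
#include <stdio.h>

struct toy_rq { int cpu; };
struct toy_se { int cpu; };
struct toy_group {
    struct toy_rq **rq;
    struct toy_se **se;
};

static void free_group(struct toy_group *tg, int nr_cpus)
{
    int i;

    for (i = 0; i < nr_cpus; i++) {
        if (tg->rq) free(tg->rq[i]);
        if (tg->se) free(tg->se[i]);
    }
    free(tg->rq);
    free(tg->se);
}

static int alloc_group(struct toy_group *tg, int nr_cpus)
{
    int i;

    tg->rq = calloc(nr_cpus, sizeof(*tg->rq));      /* per-cpu pointer arrays */
    tg->se = calloc(nr_cpus, sizeof(*tg->se));
    if (!tg->rq || !tg->se)
        goto err;

    for (i = 0; i < nr_cpus; i++) {                 /* one object per cpu */
        tg->rq[i] = calloc(1, sizeof(**tg->rq));
        tg->se[i] = calloc(1, sizeof(**tg->se));
        if (!tg->rq[i] || !tg->se[i])
            goto err;
        tg->rq[i]->cpu = tg->se[i]->cpu = i;
    }
    return 1;

err:
    free_group(tg, nr_cpus);                        /* free whatever exists */
    return 0;
}

int main(void)
{
    struct toy_group tg = { 0 };

    if (alloc_group(&tg, 4)) {
        printf("alloc ok\n");
        free_group(&tg, 4);
    } else {
        printf("alloc failed\n");
    }
    return 0;
}
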
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 2e74677cb040..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,18 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13
14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18 13
19/* 14/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -22,53 +17,54 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1)
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
23 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
24 */ 19 */
25SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
26 21
27/* 22/*
28 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
29 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
30 * touched, increases cache locality. 25 * touched, increases cache locality.
31 */ 26 */
32SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
33 28
34/* 29/*
35 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
36 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
37 * cache locality. 32 * cache locality.
38 */ 33 */
39SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
40 35
41/* 36/*
42 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
43 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
44 */ 39 */
45SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
46 41
47/* 42/*
48 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
49 */ 44 */
50SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
51 46
52SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
53SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
54SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
55 50
56/* 51/*
57 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
58 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
59 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
60 */ 55 */
61SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
62 57
63/* 58/*
64 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
65 */ 60 */
66SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
67 62
68/* 63/*
69 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */ 66 */
72SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
73 68
74SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true)
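
The features file above is an X-macro list: the scheduler consumes each SCHED_FEAT(name, default) line through more than one expansion of SCHED_FEAT(), which is part of why boolean defaults read better than bare 0/1. Below is a toy version of that expansion, generating an enum of feature indices and a default-enabled mask from a single list; the feature subset and macro names are illustrative, not the kernel's.

/*
 * Toy X-macro expansion of a SCHED_FEAT()-style list into an enum of
 * indices and a default-enabled bitmask.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_FEATURES(F)              \
    F(GENTLE_FAIR_SLEEPERS, true)    \
    F(START_DEBIT,          true)    \
    F(NEXT_BUDDY,           false)   \
    F(HRTICK,               false)

/* Pass 1: an index per feature. */
#define MAKE_ENUM(name, enabled) FEAT_##name,
enum { TOY_FEATURES(MAKE_ENUM) NR_FEATS };

/* Pass 2: OR the enabled ones into the default mask. */
#define MAKE_MASK(name, enabled) ((enabled) ? 1u << FEAT_##name : 0u) |
static const unsigned int default_feats = TOY_FEATURES(MAKE_MASK) 0u;

#define feat_enabled(name) (!!(default_feats & (1u << FEAT_##name)))

int main(void)
{
    printf("START_DEBIT=%d NEXT_BUDDY=%d (%d features)\n",
           feat_enabled(START_DEBIT), feat_enabled(NEXT_BUDDY), NR_FEATS);
    return 0;
}
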
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index af1177858be3..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
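
sched_rt_period_timer() above repeatedly forwards the hrtimer by one rt_period and hands each overrun to do_sched_rt_period_timer(), so a late wakeup still replenishes once per elapsed period. The sketch below models that catch-up loop with plain microsecond counters; the refill rule shown is a simplification of the real accounting, and the numbers are made up.

/*
 * Sketch of the replenishment loop in sched_rt_period_timer() above:
 * push the expiry forward one period at a time and hand out runtime
 * once per elapsed period ("overrun").
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t timer_forward(uint64_t *expires, uint64_t now, uint64_t period)
{
    uint64_t overruns = 0;

    while (*expires <= now) {       /* catch up if we woke late */
        *expires += period;
        overruns++;
    }
    return overruns;
}

int main(void)
{
    uint64_t period_us  = 1000000;  /* 1s RT period    */
    uint64_t runtime_us = 950000;   /* 0.95s RT budget */
    uint64_t expires = period_us;
    uint64_t spent = 1400000;       /* rt_time consumed so far */
    uint64_t now = 3200000;         /* woke up 3.2s in, i.e. late */

    uint64_t overruns = timer_forward(&expires, now, period_us);

    /* One refill per overrun, never letting the balance go negative. */
    while (overruns--)
        spent = spent > runtime_us ? spent - runtime_us : 0;

    printf("next expiry at %" PRIu64 "us, remaining rt_time %" PRIu64 "us\n",
           expires, spent);
    return 0;
}
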
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -124,21 +300,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 300 update_rt_migration(rt_rq);
125} 301}
126 302
303static inline int has_pushable_tasks(struct rq *rq)
304{
305 return !plist_head_empty(&rq->rt.pushable_tasks);
306}
307
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 308static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 309{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 310 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 311 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 312 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
313
314 /* Update the highest prio pushable task */
315 if (p->prio < rq->rt.highest_prio.next)
316 rq->rt.highest_prio.next = p->prio;
132} 317}
133 318
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 319static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 320{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 321 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 322
139static inline int has_pushable_tasks(struct rq *rq) 323 /* Update the new highest prio pushable task */
140{ 324 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 325 p = plist_first_entry(&rq->rt.pushable_tasks,
326 struct task_struct, pushable_tasks);
327 rq->rt.highest_prio.next = p->prio;
328 } else
329 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 330}
143 331
144#else 332#else
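
The hunk above keeps rq->rt.highest_prio.next in step with the pushable-task plist: on enqueue the new priority only needs comparing against the cached value, and on dequeue the new head of the sorted list (or MAX_RT_PRIO when the list is empty) becomes the next-highest. Below is a sketch of that bookkeeping with a tiny sorted array standing in for the plist; dequeueing only the head is a simplification, since the kernel removes arbitrary tasks.

/*
 * Sketch of the highest_prio.next bookkeeping added above. Lower
 * numbers mean higher priority, and MAX_RT_PRIO plays the "none" value.
 */
#include <stdio.h>

#define MAX_RT_PRIO 100

static int pushable[16];            /* priorities, kept sorted ascending */
static int nr_pushable;
static int highest_prio_next = MAX_RT_PRIO;

static void enqueue_pushable(int prio)
{
    int i = nr_pushable++;

    while (i > 0 && pushable[i - 1] > prio) {   /* insert in sorted order */
        pushable[i] = pushable[i - 1];
        i--;
    }
    pushable[i] = prio;

    if (prio < highest_prio_next)               /* new head => new "next" */
        highest_prio_next = prio;
}

static void dequeue_pushable(void)              /* drop the head entry */
{
    int i;

    for (i = 1; i < nr_pushable; i++)
        pushable[i - 1] = pushable[i];
    if (nr_pushable > 0)
        nr_pushable--;

    highest_prio_next = nr_pushable ? pushable[0] : MAX_RT_PRIO;
}

int main(void)
{
    enqueue_pushable(40);
    enqueue_pushable(10);
    printf("next=%d\n", highest_prio_next);     /* 10 */
    dequeue_pushable();
    printf("next=%d\n", highest_prio_next);     /* 40 */
    dequeue_pushable();
    printf("next=%d\n", highest_prio_next);     /* 100 = none */
    return 0;
}
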
@@ -544,10 +732,35 @@ static void enable_runtime(struct rq *rq)
544 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
545} 733}
546 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
547static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
548{ 758{
549 int more = 0; 759 int more = 0;
550 760
761 if (!sched_feat(RT_RUNTIME_SHARE))
762 return more;
763
551 if (rt_rq->rt_time > rt_rq->rt_runtime) { 764 if (rt_rq->rt_time > rt_rq->rt_runtime) {
552 raw_spin_unlock(&rt_rq->rt_runtime_lock); 765 raw_spin_unlock(&rt_rq->rt_runtime_lock);
553 more = do_balance_runtime(rt_rq); 766 more = do_balance_runtime(rt_rq);
@@ -633,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
633 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
634 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
635 848
636 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
637 return 0; 850 return 0;
638 851
639 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -643,6 +856,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 856
644 if (rt_rq->rt_time > runtime) { 857 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 858 rt_rq->rt_throttled = 1;
859 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 860 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 861 sched_rt_rq_dequeue(rt_rq);
648 return 1; 862 return 1;
@@ -698,47 +912,13 @@ static void update_curr_rt(struct rq *rq)
698 912
699#if defined CONFIG_SMP 913#if defined CONFIG_SMP
700 914
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 915static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 916inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 917{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
717 919
718 if (prio < prev_prio) { 920 if (rq->online && prio < prev_prio)
719 921 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 922}
743 923
744static void 924static void
@@ -746,9 +926,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 926{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 927 struct rq *rq = rq_of_rt_rq(rt_rq);
748 928
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 929 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 930 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 931}
@@ -961,6 +1138,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 1138
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1139 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 1140 enqueue_pushable_task(rq, p);
1141
1142 inc_nr_running(rq);
964} 1143}
965 1144
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1145static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,11 +1150,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 1150 dequeue_rt_entity(rt_se);
972 1151
973 dequeue_pushable_task(rq, p); 1152 dequeue_pushable_task(rq, p);
1153
1154 dec_nr_running(rq);
974} 1155}
975 1156
976/* 1157/*
977 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
978 * followed by enqueue. 1159 * dequeue followed by enqueue.
979 */ 1160 */
980static void 1161static void
981requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1017,10 +1198,15 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 1198 struct rq *rq;
1018 int cpu; 1199 int cpu;
1019 1200
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1206 /* For anything but wake ups, just return the task_cpu */
1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1208 goto out;
1209
1024 rq = cpu_rq(cpu); 1210 rq = cpu_rq(cpu);
1025 1211
1026 rcu_read_lock(); 1212 rcu_read_lock();
@@ -1059,6 +1245,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1245 }
1060 rcu_read_unlock(); 1246 rcu_read_unlock();
1061 1247
1248out:
1062 return cpu; 1249 return cpu;
1063} 1250}
1064 1251
@@ -1178,7 +1365,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1365static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1366{
1180 update_curr_rt(rq); 1367 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1368
1183 /* 1369 /*
1184 * The previous task needs to be made eligible for pushing 1370 * The previous task needs to be made eligible for pushing
@@ -1193,12 +1379,10 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1193/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1194#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1195 1381
1196static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1197
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1383{
1200 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1385 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1386 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1387 return 1;
1204 return 0; 1388 return 0;
@@ -1343,7 +1527,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1527 */
1344 if (unlikely(task_rq(task) != rq || 1528 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1529 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1530 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1531 task_running(rq, task) ||
1348 !task->on_rq)) { 1532 !task->on_rq)) {
1349 1533
@@ -1394,6 +1578,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1578{
1395 struct task_struct *next_task; 1579 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1580 struct rq *lowest_rq;
1581 int ret = 0;
1397 1582
1398 if (!rq->rt.overloaded) 1583 if (!rq->rt.overloaded)
1399 return 0; 1584 return 0;
@@ -1426,7 +1611,7 @@ retry:
1426 if (!lowest_rq) { 1611 if (!lowest_rq) {
1427 struct task_struct *task; 1612 struct task_struct *task;
1428 /* 1613 /*
1429 * find lock_lowest_rq releases rq->lock 1614 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1615 * so it is possible that next_task has migrated.
1431 * 1616 *
1432 * We need to make sure that the task is still on the same 1617 * We need to make sure that the task is still on the same
@@ -1436,12 +1621,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1621 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1622 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1623 /*
1439 * If we get here, the task hasn't moved at all, but 1624 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1625 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1626 * to push it to. Do not retry in this case, since
1442 * are ready. 1627 * other cpus will pull from us when ready.
1443 */ 1628 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1629 goto out;
1446 } 1630 }
1447 1631
@@ -1460,6 +1644,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1644 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1645 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1646 activate_task(lowest_rq, next_task, 0);
1647 ret = 1;
1463 1648
1464 resched_task(lowest_rq->curr); 1649 resched_task(lowest_rq->curr);
1465 1650
@@ -1468,7 +1653,7 @@ retry:
1468out: 1653out:
1469 put_task_struct(next_task); 1654 put_task_struct(next_task);
1470 1655
1471 return 1; 1656 return ret;
1472} 1657}
1473 1658
1474static void push_rt_tasks(struct rq *rq) 1659static void push_rt_tasks(struct rq *rq)
@@ -1626,9 +1811,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1811
1627 update_rt_migration(&rq->rt); 1812 update_rt_migration(&rq->rt);
1628 } 1813 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1814}
1633 1815
1634/* Assumes rq->lock is held */ 1816/* Assumes rq->lock is held */
@@ -1670,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1670 pull_rt_task(rq); 1852 pull_rt_task(rq);
1671} 1853}
1672 1854
1673static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1674{ 1856{
1675 unsigned int i; 1857 unsigned int i;
1676 1858
1677 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1678 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1679 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1680} 1863}
1681#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1682 1865
@@ -1817,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1817 return 0; 2000 return 0;
1818} 2001}
1819 2002
1820static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1821 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1822 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1823 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1852,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1852#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1853extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1854 2037
1855static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1856{ 2039{
1857 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1858 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
@@ -1863,4 +2046,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 2046 rcu_read_unlock();
1864} 2047}
1865#endif /* CONFIG_SCHED_DEBUG */ 2048#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
  46 * Single value that denotes runtime == period, i.e. unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
 146 * A weight of 0 or 1 can cause arithmetic problems.
 147 * The weight of a cfs_rq is the sum of the weights of the entities
 148 * queued on it, so the weight of an entity should not be too large,
 149 * and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
 158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
 356 * Locking rule: in code paths that lock multiple runqueues (such as
 357 * the load balancing or the thread migration code), lock acquire
 358 * operations must be ordered by ascending runqueue address.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
 548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 790 * of tasks with abnormal "nice" values across CPUs, the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055		/* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
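
The prio_to_weight[] table in the header above encodes the "10% effect" described in the nice-level comment: adjacent nice levels differ by roughly a factor of 1.25. A minimal standalone userspace sketch (illustrative only, not part of the diff; the table values are simply copied from the header) that prints those ratios:

#include <stdio.h>

static const int prio_to_weight[40] = {
	/* -20 */ 88761, 71755, 56483, 46273, 36291,
	/* -15 */ 29154, 23254, 18705, 14949, 11916,
	/* -10 */  9548,  7620,  6100,  4904,  3906,
	/*  -5 */  3121,  2501,  1991,  1586,  1277,
	/*   0 */  1024,   820,   655,   526,   423,
	/*   5 */   335,   272,   215,   172,   137,
	/*  10 */   110,    87,    70,    56,    45,
	/*  15 */    36,    29,    23,    18,    15,
};

int main(void)
{
	int nice;

	for (nice = -20; nice < 19; nice++) {
		int idx = nice + 20;	/* same offset NICE_TO_PRIO() uses */

		printf("nice %3d -> weight %6d, ratio to nice %3d: %.2f\n",
		       nice, prio_to_weight[idx], nice + 1,
		       (double)prio_to_weight[idx] / prio_to_weight[idx + 1]);
	}
	return 0;
}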
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
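
A short userspace sketch (not part of the diff) of the kind of tool the SCHEDSTAT_VERSION comment has in mind: it checks the version line first and aborts on an unknown format instead of guessing at field meanings.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	int version = 0;
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	/* First line is "version %d"; bail out on anything we don't know. */
	if (!fgets(line, sizeof(line), f) ||
	    sscanf(line, "version %d", &version) != 1 || version != 15) {
		fprintf(stderr, "unknown schedstat format\n");
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "cpu", 3))
			fputs(line, stdout);	/* per-runqueue counters */
	}
	fclose(f);
	return 0;
}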
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 331e01bcd026..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -282,10 +179,9 @@ static inline void account_group_user_time(struct task_struct *tsk,
282 if (!cputimer->running) 179 if (!cputimer->running)
283 return; 180 return;
284 181
285 spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime); 184 raw_spin_unlock(&cputimer->lock);
288 spin_unlock(&cputimer->lock);
289} 185}
290 186
291/** 187/**
@@ -306,10 +202,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
306 if (!cputimer->running) 202 if (!cputimer->running)
307 return; 203 return;
308 204
309 spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime); 207 raw_spin_unlock(&cputimer->lock);
312 spin_unlock(&cputimer->lock);
313} 208}
314 209
315/** 210/**
@@ -330,7 +225,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
330 if (!cputimer->running) 225 if (!cputimer->running)
331 return; 226 return;
332 227
333 spin_lock(&cputimer->lock); 228 raw_spin_lock(&cputimer->lock);
334 cputimer->cputime.sum_exec_runtime += ns; 229 cputimer->cputime.sum_exec_runtime += ns;
335 spin_unlock(&cputimer->lock); 230 raw_spin_unlock(&cputimer->lock);
336} 231}
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 6f437632afab..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -34,11 +36,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 36static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 37enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 38{
39 inc_nr_running(rq);
37} 40}
38 41
39static void 42static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 43dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 44{
45 dec_nr_running(rq);
42} 46}
43 47
44static void yield_task_stop(struct rq *rq) 48static void yield_task_stop(struct rq *rq)
@@ -78,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
78/* 82/*
79 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
80 */ 84 */
81static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
82 .next = &rt_sched_class, 86 .next = &rt_sched_class,
83 87
84 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ade..60636a4e25c3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 spin_lock_irqsave(&sem->lock, flags); 57 raw_spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 spin_unlock_irqrestore(&sem->lock, flags); 62 raw_spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 spin_lock_irqsave(&sem->lock, flags); 80 raw_spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 spin_unlock_irqrestore(&sem->lock, flags); 85 raw_spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 spin_lock_irqsave(&sem->lock, flags); 106 raw_spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 spin_unlock_irqrestore(&sem->lock, flags); 111 raw_spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 spin_lock_irqsave(&sem->lock, flags); 135 raw_spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 spin_unlock_irqrestore(&sem->lock, flags); 139 raw_spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 spin_lock_irqsave(&sem->lock, flags); 160 raw_spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 spin_lock_irqsave(&sem->lock, flags); 182 raw_spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 spin_unlock_irqrestore(&sem->lock, flags); 187 raw_spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 spin_lock_irq(&sem->lock); 222 raw_spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
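
For context on the semaphore.c hunks above: they only swap the internal lock for raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore(); the public semaphore API is unchanged. A minimal kernel-side usage sketch (illustrative only, assuming a kernel build environment; demo_sem and demo_do_work are made-up names):

#include <linux/semaphore.h>
#include <linux/errno.h>

static DEFINE_SEMAPHORE(demo_sem);	/* counting semaphore, count = 1 */

static int demo_do_work(void)
{
	/* May sleep; returns -EINTR if interrupted by a signal. */
	if (down_interruptible(&demo_sem))
		return -EINTR;

	/* ... serialized work here ... */

	up(&demo_sem);
	return 0;
}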
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be75..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1344,13 +1376,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1344 return error; 1376 return error;
1345} 1377}
1346 1378
1379static int kill_as_cred_perm(const struct cred *cred,
1380 struct task_struct *target)
1381{
1382 const struct cred *pcred = __task_cred(target);
1383 if (cred->user_ns != pcred->user_ns)
1384 return 0;
1385 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1386 cred->uid != pcred->suid && cred->uid != pcred->uid)
1387 return 0;
1388 return 1;
1389}
1390
1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1391/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, 1392int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1349 uid_t uid, uid_t euid, u32 secid) 1393 const struct cred *cred, u32 secid)
1350{ 1394{
1351 int ret = -EINVAL; 1395 int ret = -EINVAL;
1352 struct task_struct *p; 1396 struct task_struct *p;
1353 const struct cred *pcred;
1354 unsigned long flags; 1397 unsigned long flags;
1355 1398
1356 if (!valid_signal(sig)) 1399 if (!valid_signal(sig))
@@ -1362,10 +1405,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1362 ret = -ESRCH; 1405 ret = -ESRCH;
1363 goto out_unlock; 1406 goto out_unlock;
1364 } 1407 }
1365 pcred = __task_cred(p); 1408 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1369 ret = -EPERM; 1409 ret = -EPERM;
1370 goto out_unlock; 1410 goto out_unlock;
1371 } 1411 }
@@ -1384,7 +1424,7 @@ out_unlock:
1384 rcu_read_unlock(); 1424 rcu_read_unlock();
1385 return ret; 1425 return ret;
1386} 1426}
1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1427EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
1388 1428
1389/* 1429/*
1390 * kill_something_info() interprets pid in interesting ways just like kill(2). 1430 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1618,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1618 */ 1658 */
1619 rcu_read_lock(); 1659 rcu_read_lock();
1620 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1621 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1622 rcu_read_unlock(); 1663 rcu_read_unlock();
1623 1664
1624 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1625 tsk->signal->utime)); 1666 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1626 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1627 tsk->signal->stime));
1628 1667
1629 info.si_status = tsk->exit_code & 0x7f; 1668 info.si_status = tsk->exit_code & 0x7f;
1630 if (tsk->exit_code & 0x80) 1669 if (tsk->exit_code & 0x80)
@@ -1703,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1703 */ 1742 */
1704 rcu_read_lock(); 1743 rcu_read_lock();
1705 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1706 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1707 rcu_read_unlock(); 1747 rcu_read_unlock();
1708 1748
1709 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1986,8 +2026,6 @@ static bool do_signal_stop(int signr)
1986 */ 2026 */
1987 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 2027 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1988 sig->group_exit_code = signr; 2028 sig->group_exit_code = signr;
1989 else
1990 WARN_ON_ONCE(!current->ptrace);
1991 2029
1992 sig->group_stop_count = 0; 2030 sig->group_stop_count = 0;
1993 2031
@@ -2121,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2121 info->si_signo = signr; 2159 info->si_signo = signr;
2122 info->si_errno = 0; 2160 info->si_errno = 0;
2123 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2124 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2125 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2126 } 2167 }
2127 2168
2128 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2314,6 +2355,27 @@ relock:
2314 return signr; 2355 return signr;
2315} 2356}
2316 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
2317/* 2379/*
2318 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2319 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2351,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2351 int group_stop = 0; 2413 int group_stop = 0;
2352 sigset_t unblocked; 2414 sigset_t unblocked;
2353 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2354 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2355 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2356 return; 2425 return;
2357 } 2426 }
2358 2427
@@ -2362,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2362 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2363 */ 2432 */
2364 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2365 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2366 goto out; 2438 goto out;
2367 2439
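
The new block_sigmask() helper added above factors out logic that each architecture's signal-delivery path otherwise open-codes. A hypothetical caller might look roughly like the sketch below (handle_signal() and setup_rt_frame() are arch-specific names used only for illustration):

static void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
			  struct pt_regs *regs)
{
	/* Arch-specific: build the signal frame on the user stack. */
	if (setup_rt_frame(sig, ka, info, regs) < 0)
		return;

	/*
	 * Frame is in place: block ka->sa.sa_mask, plus sig itself
	 * unless SA_NODEFER was requested.
	 */
	block_sigmask(ka, sig);
}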
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa8394e..db197d60489b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,7 +6,7 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042b..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index be6517fb9c14..84c7d96918bf 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/export.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 73ce23feaea9..0febf61e1aa3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -24,7 +24,7 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/module.h> 27#include <linux/export.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30#include <linux/preempt.h> 30#include <linux/preempt.h>
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index d20c6983aad9..00fe55cc5a82 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce5765..2f194e965715 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,6 +41,7 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
44 45
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{ 47{
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
387 register_cpu_notifier(&cpu_stop_cpu_notifier); 388 register_cpu_notifier(&cpu_stop_cpu_notifier);
388 389
390 stop_machine_initialized = true;
391
389 return 0; 392 return 0;
390} 393}
391early_initcall(cpu_stop_init); 394early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
485 .num_threads = num_online_cpus(), 488 .num_threads = num_online_cpus(),
486 .active_cpus = cpus }; 489 .active_cpus = cpus };
487 490
491 if (!stop_machine_initialized) {
492 /*
 493 * Handle the case where stop_machine() is called
 494 * early in boot, before the stop_machine
 495 * infrastructure has been initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
488 /* Set the initial state and stop all online cpus. */ 510 /* Set the initial state and stop all online cpus. */
489 set_state(&smdata, STOPMACHINE_PREPARE); 511 set_state(&smdata, STOPMACHINE_PREPARE);
490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
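
The stop_machine_initialized flag lets __stop_machine() degrade gracefully when it is invoked before cpu_stop_init() has run: the callback simply executes on the calling CPU with hard IRQs disabled instead of going through the per-CPU stopper threads. Callers are unaffected either way; a hedged sketch of an in-kernel user (my_update and apply_update are illustrative names, not part of this patch):

#include <linux/stop_machine.h>

/* Runs while every other online CPU spins in the stopper with IRQs off,
 * or directly on the boot CPU if called before cpu_stop_init(). */
static int my_update(void *data)
{
	*(int *)data = 42;
	return 0;
}

static int apply_update(void)
{
	int val = 0;

	/* NULL cpumask: run my_update() on one CPU, hold all the others. */
	return stop_machine(my_update, &val, NULL);
}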
diff --git a/kernel/sys.c b/kernel/sys.c
index 1dbbe695a5ef..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,6 +12,7 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
15#include <linux/perf_event.h> 16#include <linux/perf_event.h>
16#include <linux/resource.h> 17#include <linux/resource.h>
17#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -1286,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1286 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1287 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1287 errno = 0; 1288 errno = 0;
1288 } 1289 }
1290 uts_proc_notify(UTS_PROC_HOSTNAME);
1289 up_write(&uts_sem); 1291 up_write(&uts_sem);
1290 return errno; 1292 return errno;
1291} 1293}
@@ -1336,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1336 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1338 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1337 errno = 0; 1339 errno = 0;
1338 } 1340 }
1341 uts_proc_notify(UTS_PROC_DOMAINNAME);
1339 up_write(&uts_sem); 1342 up_write(&uts_sem);
1340 return errno; 1343 return errno;
1341} 1344}
@@ -1602,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1602 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1603 1606
1604 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1605 utime = stime = cputime_zero; 1608 utime = stime = 0;
1606 1609
1607 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1608 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1632,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1632 1635
1633 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1634 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1635 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1636 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1637 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1638 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1639 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
@@ -1689,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1689 return mask; 1692 return mask;
1690} 1693}
1691 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1692SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1693 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1694{ 1815{
@@ -1759,6 +1880,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1759 sizeof(me->comm) - 1) < 0) 1880 sizeof(me->comm) - 1) < 0)
1760 return -EFAULT; 1881 return -EFAULT;
1761 set_task_comm(me, comm); 1882 set_task_comm(me, comm);
1883 proc_comm_connector(me);
1762 return 0; 1884 return 0;
1763 case PR_GET_NAME: 1885 case PR_GET_NAME:
1764 get_task_comm(comm, me); 1886 get_task_comm(comm, me);
@@ -1837,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1837 else 1959 else
1838 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1839 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1840 default: 1965 default:
1841 error = -EINVAL; 1966 error = -EINVAL;
1842 break; 1967 break;
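
prctl_set_mm() gives checkpoint/restore tools a way to rewrite the mm_struct layout fields (code/data/stack/brk bounds) of the current task, gated on CONFIG_CHECKPOINT_RESTORE and CAP_SYS_ADMIN. A hedged userspace sketch; the numeric fallbacks mirror the PR_SET_MM* values merged into <linux/prctl.h> and should be treated as assumptions if your headers predate them:

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35
#define PR_SET_MM_START_BRK	6
#define PR_SET_MM_BRK		7
#endif

int main(void)
{
	/* Illustrative only: point start_brk/brk at the current break.
	 * Requires CAP_SYS_ADMIN and a CONFIG_CHECKPOINT_RESTORE kernel. */
	unsigned long brk = (unsigned long)sbrk(0);

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, brk, 0, 0))
		perror("PR_SET_MM_START_BRK");
	if (prctl(PR_SET_MM, PR_SET_MM_BRK, brk, 0, 0))
		perror("PR_SET_MM_BRK");
	return 0;
}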
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a9a5de07c4f1..47bfa16430d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -145,6 +145,10 @@ cond_syscall(sys_io_submit);
145cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
146cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
147cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
148cond_syscall(sys_process_vm_readv);
149cond_syscall(sys_process_vm_writev);
150cond_syscall(compat_sys_process_vm_readv);
151cond_syscall(compat_sys_process_vm_writev);
148 152
149/* arch-specific weak syscall entries */ 153/* arch-specific weak syscall entries */
150cond_syscall(sys_pciconfig_read); 154cond_syscall(sys_pciconfig_read);
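
The new cond_syscall() entries provide -ENOSYS stubs for process_vm_readv()/process_vm_writev() on configurations that do not build them (CONFIG_CROSS_MEMORY_ATTACH). A minimal reader, assuming a glibc new enough (2.15+) to ship the wrapper in <sys/uio.h>; the pid and address come from the command line:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(int argc, char **argv)
{
	char buf[64];
	struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote;
	ssize_t n;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-addr>\n", argv[0]);
		return 1;
	}
	remote.iov_base = (void *)strtoul(argv[2], NULL, 16);
	remote.iov_len = sizeof(buf);

	/* Copies straight out of the target's address space; fails with
	 * ENOSYS where the kernel lacks CONFIG_CROSS_MEMORY_ATTACH. */
	n = process_vm_readv((pid_t)atoi(argv[1]), &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("read %zd bytes\n", n);
	return 0;
}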
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -57,6 +57,7 @@
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60#include <linux/capability.h>
60 61
61#include <asm/uaccess.h> 62#include <asm/uaccess.h>
62#include <asm/processor.h> 63#include <asm/processor.h>
@@ -134,6 +135,7 @@ static int minolduid;
134static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
135 136
136static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
138static const int cap_last_cap = CAP_LAST_CAP;
137 139
138#ifdef CONFIG_INOTIFY_USER 140#ifdef CONFIG_INOTIFY_USER
139#include <linux/inotify.h> 141#include <linux/inotify.h>
@@ -151,14 +153,6 @@ extern int pwrsw_enabled;
151extern int unaligned_enabled; 153extern int unaligned_enabled;
152#endif 154#endif
153 155
154#ifdef CONFIG_S390
155#ifdef CONFIG_MATHEMU
156extern int sysctl_ieee_emulation_warnings;
157#endif
158extern int sysctl_userprocess_debug;
159extern int spin_retry;
160#endif
161
162#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
163extern int no_unaligned_warning; 157extern int no_unaligned_warning;
164extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 373 .extra2 = &one,
380 }, 374 },
381#endif 375#endif
376#ifdef CONFIG_CFS_BANDWIDTH
377 {
378 .procname = "sched_cfs_bandwidth_slice_us",
379 .data = &sysctl_sched_cfs_bandwidth_slice,
380 .maxlen = sizeof(unsigned int),
381 .mode = 0644,
382 .proc_handler = proc_dointvec_minmax,
383 .extra1 = &one,
384 },
385#endif
382#ifdef CONFIG_PROVE_LOCKING 386#ifdef CONFIG_PROVE_LOCKING
383 { 387 {
384 .procname = "prove_locking", 388 .procname = "prove_locking",
@@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = {
730 .mode = 0444, 734 .mode = 0444,
731 .proc_handler = proc_dointvec, 735 .proc_handler = proc_dointvec,
732 }, 736 },
737 {
738 .procname = "cap_last_cap",
739 .data = (void *)&cap_last_cap,
740 .maxlen = sizeof(int),
741 .mode = 0444,
742 .proc_handler = proc_dointvec,
743 },
733#if defined(CONFIG_LOCKUP_DETECTOR) 744#if defined(CONFIG_LOCKUP_DETECTOR)
734 { 745 {
735 .procname = "watchdog", 746 .procname = "watchdog",
@@ -792,6 +803,15 @@ static struct ctl_table kern_table[] = {
792 .mode = 0644, 803 .mode = 0644,
793 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
794 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
795 { 815 {
796 .procname = "bootloader_type", 816 .procname = "bootloader_type",
797 .data = &bootloader_type, 817 .data = &bootloader_type,
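
The read-only kernel.cap_last_cap entry exports CAP_LAST_CAP so userspace can size capability sets at run time instead of trusting compile-time headers; sched_cfs_bandwidth_slice_us and panic_on_stackoverflow are ordinary writable integers. A minimal sketch reading the procfs path implied by the kern_table entry:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/cap_last_cap", "r");
	int last_cap;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &last_cap) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("kernel supports capabilities 0..%d\n", last_cap);
	return 0;
}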
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8bffbe2ba4b..a650694883a1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, 217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1354 1354
1355 fput(file); 1355 fput(file);
1356out_putname: 1356out_putname:
1357 putname(pathname); 1357 __putname(pathname);
1358out: 1358out:
1359 return result; 1359 return result;
1360} 1360}
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c9..73e416db0a1e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,7 +27,7 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/clocksource.h>
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
575/* 575/*
576 * Convert jiffies/jiffies_64 to clock_t and back. 576 * Convert jiffies/jiffies_64 to clock_t and back.
577 */ 577 */
578clock_t jiffies_to_clock_t(long x) 578clock_t jiffies_to_clock_t(unsigned long x)
579{ 579{
580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
581# if HZ < USER_HZ 581# if HZ < USER_HZ
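
Widening jiffies_to_clock_t() to take unsigned long keeps large tick counts (e.g. process start times exported via /proc on 32-bit) from being interpreted as negative. The clock_t values it produces are in USER_HZ units, which userspace converts back with sysconf(_SC_CLK_TCK); a small sketch of that consumer-side conversion (illustrative, not tied to this hunk):

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	clock_t ticks = times(&t);          /* clock_t ticks since an arbitrary epoch */
	long hz = sysconf(_SC_CLK_TCK);     /* USER_HZ, the unit clock_t is counted in */

	printf("%.2f s elapsed (%ld ticks/s)\n", (double)ticks / hz, hz);
	return 0;
}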
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f06a8a365648..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,5 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ea5e1a928d5b..8a46f5d64504 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -53,27 +53,6 @@ static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock); 53static DEFINE_SPINLOCK(rtcdev_lock);
54 54
55/** 55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice 56 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 * 57 *
79 * This function returns the rtc device to use for wakealarms. 58 * This function returns the rtc device to use for wakealarms.
@@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr)
82 */ 61 */
83static struct rtc_device *alarmtimer_get_rtcdev(void) 62static struct rtc_device *alarmtimer_get_rtcdev(void)
84{ 63{
85 struct device *dev;
86 char *str;
87 unsigned long flags; 64 unsigned long flags;
88 struct rtc_device *ret; 65 struct rtc_device *ret;
89 66
90 spin_lock_irqsave(&rtcdev_lock, flags); 67 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev; 68 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags); 69 spin_unlock_irqrestore(&rtcdev_lock, flags);
107 70
108 return ret; 71 return ret;
109} 72}
73
74
75static int alarmtimer_rtc_add_device(struct device *dev,
76 struct class_interface *class_intf)
77{
78 unsigned long flags;
79 struct rtc_device *rtc = to_rtc_device(dev);
80
81 if (rtcdev)
82 return -EBUSY;
83
84 if (!rtc->ops->set_alarm)
85 return -1;
86 if (!device_may_wakeup(rtc->dev.parent))
87 return -1;
88
89 spin_lock_irqsave(&rtcdev_lock, flags);
90 if (!rtcdev) {
91 rtcdev = rtc;
92 /* hold a reference so it doesn't go away */
93 get_device(dev);
94 }
95 spin_unlock_irqrestore(&rtcdev_lock, flags);
96 return 0;
97}
98
99static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device,
101};
102
103static int alarmtimer_rtc_interface_setup(void)
104{
105 alarmtimer_rtc_interface.class = rtc_class;
106 return class_interface_register(&alarmtimer_rtc_interface);
107}
108static void alarmtimer_rtc_interface_remove(void)
109{
110 class_interface_unregister(&alarmtimer_rtc_interface);
111}
110#else 112#else
111#define alarmtimer_get_rtcdev() (0) 113static inline struct rtc_device *alarmtimer_get_rtcdev(void)
112#define rtcdev (0) 114{
115 return NULL;
116}
117#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { }
113#endif 120#endif
114 121
115
116/** 122/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue 123 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run 124 * @base: pointer to the base where the timer is being run
@@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void)
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 132static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{ 133{
128 timerqueue_add(&base->timerqueue, &alarm->node); 134 timerqueue_add(&base->timerqueue, &alarm->node);
135 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
136
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { 137 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer); 138 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires, 139 hrtimer_start(&base->timer, alarm->node.expires,
@@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{ 155{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); 156 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149 157
158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
159 return;
160
150 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
163
151 if (next == &alarm->node) { 164 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer); 165 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue); 166 next = timerqueue_getnext(&base->timerqueue);
@@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
174 unsigned long flags; 187 unsigned long flags;
175 ktime_t now; 188 ktime_t now;
176 int ret = HRTIMER_NORESTART; 189 int ret = HRTIMER_NORESTART;
190 int restart = ALARMTIMER_NORESTART;
177 191
178 spin_lock_irqsave(&base->lock, flags); 192 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime(); 193 now = base->gettime();
@@ -181,23 +195,25 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
181 struct alarm *alarm; 195 struct alarm *alarm;
182 ktime_t expired = next->expires; 196 ktime_t expired = next->expires;
183 197
184 if (expired.tv64 >= now.tv64) 198 if (expired.tv64 > now.tv64)
185 break; 199 break;
186 200
187 alarm = container_of(next, struct alarm, node); 201 alarm = container_of(next, struct alarm, node);
188 202
189 timerqueue_del(&base->timerqueue, &alarm->node); 203 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0; 204 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
191 /* Re-add periodic timers */ 205
192 if (alarm->period.tv64) { 206 alarm->state |= ALARMTIMER_STATE_CALLBACK;
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags); 207 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function) 208 if (alarm->function)
199 alarm->function(alarm); 209 restart = alarm->function(alarm, now);
200 spin_lock_irqsave(&base->lock, flags); 210 spin_lock_irqsave(&base->lock, flags);
211 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
212
213 if (restart != ALARMTIMER_NORESTART) {
214 timerqueue_add(&base->timerqueue, &alarm->node);
215 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
216 }
201 } 217 }
202 218
203 if (next) { 219 if (next) {
@@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev)
234 freezer_delta = ktime_set(0, 0); 250 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags); 251 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236 252
237 rtc = rtcdev; 253 rtc = alarmtimer_get_rtcdev();
238 /* If we have no rtcdev, just return */ 254 /* If we have no rtcdev, just return */
239 if (!rtc) 255 if (!rtc)
240 return 0; 256 return 0;
@@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
299 * @function: callback that is run when the alarm fires 315 * @function: callback that is run when the alarm fires
300 */ 316 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type, 317void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *)) 318 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
303{ 319{
304 timerqueue_init(&alarm->node); 320 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function; 321 alarm->function = function;
307 alarm->type = type; 322 alarm->type = type;
308 alarm->enabled = 0; 323 alarm->state = ALARMTIMER_STATE_INACTIVE;
309} 324}
310 325
311/** 326/**
312 * alarm_start - Sets an alarm to fire 327 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set 328 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm 329 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */ 330 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) 331void alarm_start(struct alarm *alarm, ktime_t start)
318{ 332{
319 struct alarm_base *base = &alarm_bases[alarm->type]; 333 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags; 334 unsigned long flags;
321 335
322 spin_lock_irqsave(&base->lock, flags); 336 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled) 337 if (alarmtimer_active(alarm))
324 alarmtimer_remove(base, alarm); 338 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start; 339 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm); 340 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags); 341 spin_unlock_irqrestore(&base->lock, flags);
330} 342}
331 343
332/** 344/**
333 * alarm_cancel - Tries to cancel an alarm timer 345 * alarm_try_to_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled 346 * @alarm: ptr to alarm to be canceled
347 *
348 * Returns 1 if the timer was canceled, 0 if it was not running,
349 * and -1 if the callback was running
335 */ 350 */
336void alarm_cancel(struct alarm *alarm) 351int alarm_try_to_cancel(struct alarm *alarm)
337{ 352{
338 struct alarm_base *base = &alarm_bases[alarm->type]; 353 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags; 354 unsigned long flags;
340 355 int ret = -1;
341 spin_lock_irqsave(&base->lock, flags); 356 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled) 357
358 if (alarmtimer_callback_running(alarm))
359 goto out;
360
361 if (alarmtimer_is_queued(alarm)) {
343 alarmtimer_remove(base, alarm); 362 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0; 363 ret = 1;
364 } else
365 ret = 0;
366out:
345 spin_unlock_irqrestore(&base->lock, flags); 367 spin_unlock_irqrestore(&base->lock, flags);
368 return ret;
369}
370
371
372/**
373 * alarm_cancel - Spins trying to cancel an alarm timer until it is done
374 * @alarm: ptr to alarm to be canceled
375 *
376 * Returns 1 if the timer was canceled, 0 if it was not active.
377 */
378int alarm_cancel(struct alarm *alarm)
379{
380 for (;;) {
381 int ret = alarm_try_to_cancel(alarm);
382 if (ret >= 0)
383 return ret;
384 cpu_relax();
385 }
386}
387
388
389u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
390{
391 u64 overrun = 1;
392 ktime_t delta;
393
394 delta = ktime_sub(now, alarm->node.expires);
395
396 if (delta.tv64 < 0)
397 return 0;
398
399 if (unlikely(delta.tv64 >= interval.tv64)) {
400 s64 incr = ktime_to_ns(interval);
401
402 overrun = ktime_divns(delta, incr);
403
404 alarm->node.expires = ktime_add_ns(alarm->node.expires,
405 incr*overrun);
406
407 if (alarm->node.expires.tv64 > now.tv64)
408 return overrun;
409 /*
410 * This (and the ktime_add() below) is the
411 * correction for exact:
412 */
413 overrun++;
414 }
415
416 alarm->node.expires = ktime_add(alarm->node.expires, interval);
417 return overrun;
346} 418}
347 419
348 420
421
422
349/** 423/**
350 * clock2alarm - helper that converts from clockid to alarmtypes 424 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid. 425 * @clockid: clockid.
@@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
365 * 439 *
366 * Posix timer callback for expired alarm timers. 440 * Posix timer callback for expired alarm timers.
367 */ 441 */
368static void alarm_handle_timer(struct alarm *alarm) 442static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
443 ktime_t now)
369{ 444{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 445 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer); 446 it.alarm.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0) 447 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++; 448 ptr->it_overrun++;
449
450 /* Re-add periodic timers */
451 if (ptr->it.alarm.interval.tv64) {
452 ptr->it_overrun += alarm_forward(alarm, now,
453 ptr->it.alarm.interval);
454 return ALARMTIMER_RESTART;
455 }
456 return ALARMTIMER_NORESTART;
374} 457}
375 458
376/** 459/**
@@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
427 510
428 type = clock2alarm(new_timer->it_clock); 511 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type]; 512 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); 513 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
431 return 0; 514 return 0;
432} 515}
433 516
@@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr,
444 memset(cur_setting, 0, sizeof(struct itimerspec)); 527 memset(cur_setting, 0, sizeof(struct itimerspec));
445 528
446 cur_setting->it_interval = 529 cur_setting->it_interval =
447 ktime_to_timespec(timr->it.alarmtimer.period); 530 ktime_to_timespec(timr->it.alarm.interval);
448 cur_setting->it_value = 531 cur_setting->it_value =
449 ktime_to_timespec(timr->it.alarmtimer.node.expires); 532 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
450 return; 533 return;
451} 534}
452 535
@@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr)
461 if (!rtcdev) 544 if (!rtcdev)
462 return -ENOTSUPP; 545 return -ENOTSUPP;
463 546
464 alarm_cancel(&timr->it.alarmtimer); 547 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
548 return TIMER_RETRY;
549
465 return 0; 550 return 0;
466} 551}
467 552
@@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
481 if (!rtcdev) 566 if (!rtcdev)
482 return -ENOTSUPP; 567 return -ENOTSUPP;
483 568
484 /*
485 * XXX HACK! Currently we can DOS a system if the interval
486 * period on alarmtimers is too small. Cap the interval here
487 * to 100us and solve this properly in a future patch! -jstultz
488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
493 if (old_setting) 569 if (old_setting)
494 alarm_timer_get(timr, old_setting); 570 alarm_timer_get(timr, old_setting);
495 571
496 /* If the timer was already set, cancel it */ 572 /* If the timer was already set, cancel it */
497 alarm_cancel(&timr->it.alarmtimer); 573 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
574 return TIMER_RETRY;
498 575
499 /* start the timer */ 576 /* start the timer */
500 alarm_start(&timr->it.alarmtimer, 577 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
501 timespec_to_ktime(new_setting->it_value), 578 alarm_start(&timr->it.alarm.alarmtimer,
502 timespec_to_ktime(new_setting->it_interval)); 579 timespec_to_ktime(new_setting->it_value));
503 return 0; 580 return 0;
504} 581}
505 582
@@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
509 * 586 *
510 * Wakes up the task that set the alarmtimer 587 * Wakes up the task that set the alarmtimer
511 */ 588 */
512static void alarmtimer_nsleep_wakeup(struct alarm *alarm) 589static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
590 ktime_t now)
513{ 591{
514 struct task_struct *task = (struct task_struct *)alarm->data; 592 struct task_struct *task = (struct task_struct *)alarm->data;
515 593
516 alarm->data = NULL; 594 alarm->data = NULL;
517 if (task) 595 if (task)
518 wake_up_process(task); 596 wake_up_process(task);
597 return ALARMTIMER_NORESTART;
519} 598}
520 599
521/** 600/**
@@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
530 alarm->data = (void *)current; 609 alarm->data = (void *)current;
531 do { 610 do {
532 set_current_state(TASK_INTERRUPTIBLE); 611 set_current_state(TASK_INTERRUPTIBLE);
533 alarm_start(alarm, absexp, ktime_set(0, 0)); 612 alarm_start(alarm, absexp);
534 if (likely(alarm->data)) 613 if (likely(alarm->data))
535 schedule(); 614 schedule();
536 615
@@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = {
691 */ 770 */
692static int __init alarmtimer_init(void) 771static int __init alarmtimer_init(void)
693{ 772{
773 struct platform_device *pdev;
694 int error = 0; 774 int error = 0;
695 int i; 775 int i;
696 struct k_clock alarm_clock = { 776 struct k_clock alarm_clock = {
@@ -719,10 +799,26 @@ static int __init alarmtimer_init(void)
719 HRTIMER_MODE_ABS); 799 HRTIMER_MODE_ABS);
720 alarm_bases[i].timer.function = alarmtimer_fired; 800 alarm_bases[i].timer.function = alarmtimer_fired;
721 } 801 }
802
803 error = alarmtimer_rtc_interface_setup();
804 if (error)
805 return error;
806
722 error = platform_driver_register(&alarmtimer_driver); 807 error = platform_driver_register(&alarmtimer_driver);
723 platform_device_register_simple("alarmtimer", -1, NULL, 0); 808 if (error)
809 goto out_if;
724 810
811 pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
812 if (IS_ERR(pdev)) {
813 error = PTR_ERR(pdev);
814 goto out_drv;
815 }
816 return 0;
817
818out_drv:
819 platform_driver_unregister(&alarmtimer_driver);
820out_if:
821 alarmtimer_rtc_interface_remove();
725 return error; 822 return error;
726} 823}
727device_initcall(alarmtimer_init); 824device_initcall(alarmtimer_init);
728
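
The alarm API now mirrors hrtimers: callbacks receive the current time and return ALARMTIMER_RESTART or ALARMTIMER_NORESTART, periodic behaviour moves into the callback via alarm_forward(), and cancellation gains a non-blocking alarm_try_to_cancel(). A hedged sketch of an in-kernel consumer; my_alarm, my_period and my_handler are illustrative names, not part of this patch:

#include <linux/alarmtimer.h>
#include <linux/ktime.h>

static struct alarm my_alarm;
static ktime_t my_period;

static enum alarmtimer_restart my_handler(struct alarm *a, ktime_t now)
{
	/* Periodic re-arm: push the expiry forward by whole periods and
	 * let the core requeue the alarm. */
	alarm_forward(a, now, my_period);
	return ALARMTIMER_RESTART;
}

static void my_setup(void)
{
	my_period = ktime_set(1, 0);	/* 1 second */
	alarm_init(&my_alarm, ALARM_REALTIME, my_handler);
	alarm_start(&my_alarm, ktime_add(ktime_get_real(), my_period));
}

static void my_teardown(void)
{
	alarm_cancel(&my_alarm);	/* spins until any running callback returns */
}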
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e4c699dfa4e8..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
@@ -94,42 +93,143 @@ void clockevents_shutdown(struct clock_event_device *dev)
94 dev->next_event.tv64 = KTIME_MAX; 93 dev->next_event.tv64 = KTIME_MAX;
95} 94}
96 95
96#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
97
98/* Limit min_delta to a jiffie */
99#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
100
101/**
102 * clockevents_increase_min_delta - raise minimum delta of a clock event device
103 * @dev: device to increase the minimum delta
104 *
105 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
106 */
107static int clockevents_increase_min_delta(struct clock_event_device *dev)
108{
109 /* Nothing to do if we already reached the limit */
110 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
111 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
112 dev->next_event.tv64 = KTIME_MAX;
113 return -ETIME;
114 }
115
116 if (dev->min_delta_ns < 5000)
117 dev->min_delta_ns = 5000;
118 else
119 dev->min_delta_ns += dev->min_delta_ns >> 1;
120
121 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
122 dev->min_delta_ns = MIN_DELTA_LIMIT;
123
124 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
125 dev->name ? dev->name : "?",
126 (unsigned long long) dev->min_delta_ns);
127 return 0;
128}
129
130/**
131 * clockevents_program_min_delta - Set clock event device to the minimum delay.
132 * @dev: device to program
133 *
134 * Returns 0 on success, -ETIME when the retry loop failed.
135 */
136static int clockevents_program_min_delta(struct clock_event_device *dev)
137{
138 unsigned long long clc;
139 int64_t delta;
140 int i;
141
142 for (i = 0;;) {
143 delta = dev->min_delta_ns;
144 dev->next_event = ktime_add_ns(ktime_get(), delta);
145
146 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
147 return 0;
148
149 dev->retries++;
150 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
151 if (dev->set_next_event((unsigned long) clc, dev) == 0)
152 return 0;
153
154 if (++i > 2) {
155 /*
156 * We tried 3 times to program the device with the
157 * given min_delta_ns. Try to increase the minimum
158 * delta, if that fails as well get out of here.
159 */
160 if (clockevents_increase_min_delta(dev))
161 return -ETIME;
162 i = 0;
163 }
164 }
165}
166
167#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
168
169/**
170 * clockevents_program_min_delta - Set clock event device to the minimum delay.
171 * @dev: device to program
172 *
173 * Returns 0 on success, -ETIME when the retry loop failed.
174 */
175static int clockevents_program_min_delta(struct clock_event_device *dev)
176{
177 unsigned long long clc;
178 int64_t delta;
179
180 delta = dev->min_delta_ns;
181 dev->next_event = ktime_add_ns(ktime_get(), delta);
182
183 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
184 return 0;
185
186 dev->retries++;
187 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
188 return dev->set_next_event((unsigned long) clc, dev);
189}
190
191#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
192
97/** 193/**
98 * clockevents_program_event - Reprogram the clock event device. 194 * clockevents_program_event - Reprogram the clock event device.
195 * @dev: device to program
99 * @expires: absolute expiry time (monotonic clock) 196 * @expires: absolute expiry time (monotonic clock)
197 * @force: program minimum delay if expires can not be set
100 * 198 *
101 * Returns 0 on success, -ETIME when the event is in the past. 199 * Returns 0 on success, -ETIME when the event is in the past.
102 */ 200 */
103int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 201int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
104 ktime_t now) 202 bool force)
105{ 203{
106 unsigned long long clc; 204 unsigned long long clc;
107 int64_t delta; 205 int64_t delta;
206 int rc;
108 207
109 if (unlikely(expires.tv64 < 0)) { 208 if (unlikely(expires.tv64 < 0)) {
110 WARN_ON_ONCE(1); 209 WARN_ON_ONCE(1);
111 return -ETIME; 210 return -ETIME;
112 } 211 }
113 212
114 delta = ktime_to_ns(ktime_sub(expires, now));
115
116 if (delta <= 0)
117 return -ETIME;
118
119 dev->next_event = expires; 213 dev->next_event = expires;
120 214
121 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 215 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
122 return 0; 216 return 0;
123 217
124 if (delta > dev->max_delta_ns) 218 /* Shortcut for clockevent devices that can deal with ktime. */
125 delta = dev->max_delta_ns; 219 if (dev->features & CLOCK_EVT_FEAT_KTIME)
126 if (delta < dev->min_delta_ns) 220 return dev->set_next_ktime(expires, dev);
127 delta = dev->min_delta_ns; 221
222 delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
223 if (delta <= 0)
224 return force ? clockevents_program_min_delta(dev) : -ETIME;
128 225
129 clc = delta * dev->mult; 226 delta = min(delta, (int64_t) dev->max_delta_ns);
130 clc >>= dev->shift; 227 delta = max(delta, (int64_t) dev->min_delta_ns);
131 228
132 return dev->set_next_event((unsigned long) clc, dev); 229 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
230 rc = dev->set_next_event((unsigned long) clc, dev);
231
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
133} 233}
134 234
135/** 235/**
@@ -258,7 +358,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 358 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0; 359 return 0;
260 360
261 return clockevents_program_event(dev, dev->next_event, ktime_get()); 361 return clockevents_program_event(dev, dev->next_event, false);
262} 362}
263 363
264/* 364/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0a..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static int watchdog_running; 188static int watchdog_running;
189static atomic_t watchdog_reset_pending;
189 190
190static int clocksource_watchdog_kthread(void *data); 191static int clocksource_watchdog_kthread(void *data);
191static void __clocksource_change_rating(struct clocksource *cs, int rating); 192static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data)
247 struct clocksource *cs; 248 struct clocksource *cs;
248 cycle_t csnow, wdnow; 249 cycle_t csnow, wdnow;
249 int64_t wd_nsec, cs_nsec; 250 int64_t wd_nsec, cs_nsec;
250 int next_cpu; 251 int next_cpu, reset_pending;
251 252
252 spin_lock(&watchdog_lock); 253 spin_lock(&watchdog_lock);
253 if (!watchdog_running) 254 if (!watchdog_running)
254 goto out; 255 goto out;
255 256
257 reset_pending = atomic_read(&watchdog_reset_pending);
258
256 list_for_each_entry(cs, &watchdog_list, wd_list) { 259 list_for_each_entry(cs, &watchdog_list, wd_list) {
257 260
258 /* Clocksource already marked unstable? */ 261 /* Clocksource already marked unstable? */
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data)
268 local_irq_enable(); 271 local_irq_enable();
269 272
270 /* Clocksource initialized ? */ 273 /* Clocksource initialized ? */
271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
275 atomic_read(&watchdog_reset_pending)) {
272 cs->flags |= CLOCK_SOURCE_WATCHDOG; 276 cs->flags |= CLOCK_SOURCE_WATCHDOG;
273 cs->wd_last = wdnow; 277 cs->wd_last = wdnow;
274 cs->cs_last = csnow; 278 cs->cs_last = csnow;
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data)
283 cs->cs_last = csnow; 287 cs->cs_last = csnow;
284 cs->wd_last = wdnow; 288 cs->wd_last = wdnow;
285 289
290 if (atomic_read(&watchdog_reset_pending))
291 continue;
292
286 /* Check the deviation from the watchdog clocksource. */ 293 /* Check the deviation from the watchdog clocksource. */
287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 294 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
288 clocksource_unstable(cs, cs_nsec - wd_nsec); 295 clocksource_unstable(cs, cs_nsec - wd_nsec);
289 continue; 296 continue;
290 } 297 }
@@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data)
303 } 310 }
304 311
305 /* 312 /*
313 * We only clear the watchdog_reset_pending, when we did a
314 * full cycle through all clocksources.
315 */
316 if (reset_pending)
317 atomic_dec(&watchdog_reset_pending);
318
319 /*
306 * Cycle through CPUs to check if the CPUs stay synchronized 320 * Cycle through CPUs to check if the CPUs stay synchronized
307 * to each other. 321 * to each other.
308 */ 322 */
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void)
344 358
345static void clocksource_resume_watchdog(void) 359static void clocksource_resume_watchdog(void)
346{ 360{
347 unsigned long flags; 361 atomic_inc(&watchdog_reset_pending);
348
349 /*
350 * We use trylock here to avoid a potential dead lock when
351 * kgdb calls this code after the kernel has been stopped with
352 * watchdog_lock held. When watchdog_lock is held we just
353 * return and accept, that the watchdog might trigger and mark
354 * the monitored clock source (usually TSC) unstable.
355 *
356 * This does not affect the other caller clocksource_resume()
357 * because at this point the kernel is UP, interrupts are
358 * disabled and nothing can hold watchdog_lock.
359 */
360 if (!spin_trylock_irqsave(&watchdog_lock, flags))
361 return;
362 clocksource_reset_watchdog();
363 spin_unlock_irqrestore(&watchdog_lock, flags);
364} 362}
365 363
366static void clocksource_enqueue_watchdog(struct clocksource *cs) 364static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -494,6 +492,22 @@ void clocksource_touch_watchdog(void)
494} 492}
495 493
496/** 494/**
495 * clocksource_max_adjustment- Returns max adjustment amount
496 * @cs: Pointer to clocksource
497 *
498 */
499static u32 clocksource_max_adjustment(struct clocksource *cs)
500{
501 u64 ret;
502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm),
504 */
505 ret = (u64)cs->mult * 11;
506 do_div(ret,100);
507 return (u32)ret;
508}
509
510/**
497 * clocksource_max_deferment - Returns max time the clocksource can be deferred 511 * clocksource_max_deferment - Returns max time the clocksource can be deferred
498 * @cs: Pointer to clocksource 512 * @cs: Pointer to clocksource
499 * 513 *
@@ -505,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
505 /* 519 /*
506 * Calculate the maximum number of cycles that we can pass to the 520 * Calculate the maximum number of cycles that we can pass to the
507 * cyc2ns function without overflowing a 64-bit signed result. The 521 * cyc2ns function without overflowing a 64-bit signed result. The
508 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 522 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
509 * is equivalent to the below. 523 * which is equivalent to the below.
510 * max_cycles < (2^63)/cs->mult 524 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
511 * max_cycles < 2^(log2((2^63)/cs->mult)) 525 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
512 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 526 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
513 * max_cycles < 2^(63 - log2(cs->mult)) 527 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
514 * max_cycles < 1 << (63 - log2(cs->mult)) 528 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
515 * Please note that we add 1 to the result of the log2 to account for 529 * Please note that we add 1 to the result of the log2 to account for
516 * any rounding errors, ensure the above inequality is satisfied and 530 * any rounding errors, ensure the above inequality is satisfied and
517 * no overflow will occur. 531 * no overflow will occur.
518 */ 532 */
519 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 533 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
520 534
521 /* 535 /*
522 * The actual maximum number of cycles we can defer the clocksource is 536 * The actual maximum number of cycles we can defer the clocksource is
523 * determined by the minimum of max_cycles and cs->mask. 537 * determined by the minimum of max_cycles and cs->mask.
538 * Note: Here we subtract the maxadj to make sure we don't sleep for
539 * too long if there's a large negative adjustment.
524 */ 540 */
525 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 541 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
526 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 542 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
543 cs->shift);
527 544
528 /* 545 /*
529 * To ensure that the clocksource does not wrap whilst we are idle, 546 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -531,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 * note a margin of 12.5% is used because this can be computed with 548 * note a margin of 12.5% is used because this can be computed with
532 * a shift, versus say 10% which would require division. 549 * a shift, versus say 10% which would require division.
533 */ 550 */
534 return max_nsecs - (max_nsecs >> 5); 551 return max_nsecs - (max_nsecs >> 3);
535} 552}
536 553
537#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -630,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs)
630 647
631/** 648/**
632 * __clocksource_updatefreq_scale - Used update clocksource with new freq 649 * __clocksource_updatefreq_scale - Used update clocksource with new freq
633 * @t: clocksource to be registered 650 * @cs: clocksource to be registered
634 * @scale: Scale factor multiplied against freq to get clocksource hz 651 * @scale: Scale factor multiplied against freq to get clocksource hz
635 * @freq: clocksource frequency (cycles per second) divided by scale 652 * @freq: clocksource frequency (cycles per second) divided by scale
636 * 653 *
@@ -642,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)
642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 659void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
643{ 660{
644 u64 sec; 661 u64 sec;
645
646 /* 662 /*
647 * Calc the maximum number of seconds which we can run before 663 * Calc the maximum number of seconds which we can run before
648 * wrapping around. For clocksources which have a mask > 32bit 664 * wrapping around. For clocksources which have a mask > 32bit
@@ -653,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% 669 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment() 670 * margin as we do in clocksource_max_deferment()
655 */ 671 */
656 sec = (cs->mask - (cs->mask >> 5)); 672 sec = (cs->mask - (cs->mask >> 3));
657 do_div(sec, freq); 673 do_div(sec, freq);
658 do_div(sec, scale); 674 do_div(sec, scale);
659 if (!sec) 675 if (!sec)
@@ -663,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
663 679
664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 680 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
665 NSEC_PER_SEC / scale, sec * scale); 681 NSEC_PER_SEC / scale, sec * scale);
682
683 /*
 684 * For clocksources that have large mults, avoid overflow:
 685 * since mult may be adjusted by NTP, add an extra safety margin.
686 *
687 */
688 cs->maxadj = clocksource_max_adjustment(cs);
689 while ((cs->mult + cs->maxadj < cs->mult)
690 || (cs->mult - cs->maxadj > cs->mult)) {
691 cs->mult >>= 1;
692 cs->shift--;
693 cs->maxadj = clocksource_max_adjustment(cs);
694 }
695
666 cs->max_idle_ns = clocksource_max_deferment(cs); 696 cs->max_idle_ns = clocksource_max_deferment(cs);
667} 697}
668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 698EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
669 699
670/** 700/**
671 * __clocksource_register_scale - Used to install new clocksources 701 * __clocksource_register_scale - Used to install new clocksources
672 * @t: clocksource to be registered 702 * @cs: clocksource to be registered
673 * @scale: Scale factor multiplied against freq to get clocksource hz 703 * @scale: Scale factor multiplied against freq to get clocksource hz
674 * @freq: clocksource frequency (cycles per second) divided by scale 704 * @freq: clocksource frequency (cycles per second) divided by scale
675 * 705 *
@@ -697,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
697 727
698/** 728/**
699 * clocksource_register - Used to install new clocksources 729 * clocksource_register - Used to install new clocksources
700 * @t: clocksource to be registered 730 * @cs: clocksource to be registered
701 * 731 *
702 * Returns -EBUSY if registration fails, zero otherwise. 732 * Returns -EBUSY if registration fails, zero otherwise.
703 */ 733 */
704int clocksource_register(struct clocksource *cs) 734int clocksource_register(struct clocksource *cs)
705{ 735{
736 /* calculate max adjustment for given mult/shift */
737 cs->maxadj = clocksource_max_adjustment(cs);
738 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
739 "Clocksource %s might overflow on 11%% adjustment\n",
740 cs->name);
741
706 /* calculate max idle time permitted for this clocksource */ 742 /* calculate max idle time permitted for this clocksource */
707 cs->max_idle_ns = clocksource_max_deferment(cs); 743 cs->max_idle_ns = clocksource_max_deferment(cs);
708 744
@@ -725,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
725 761
726/** 762/**
727 * clocksource_change_rating - Change the rating of a registered clocksource 763 * clocksource_change_rating - Change the rating of a registered clocksource
764 * @cs: clocksource to be changed
765 * @rating: new rating
728 */ 766 */
729void clocksource_change_rating(struct clocksource *cs, int rating) 767void clocksource_change_rating(struct clocksource *cs, int rating)
730{ 768{
@@ -736,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating);
736 774
737/** 775/**
738 * clocksource_unregister - remove a registered clocksource 776 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered
739 */ 778 */
740void clocksource_unregister(struct clocksource *cs) 779void clocksource_unregister(struct clocksource *cs)
741{ 780{
@@ -751,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister);
751/** 790/**
752 * sysfs_show_current_clocksources - sysfs interface for current clocksource 791 * sysfs_show_current_clocksources - sysfs interface for current clocksource
753 * @dev: unused 792 * @dev: unused
793 * @attr: unused
754 * @buf: char buffer to be filled with clocksource list 794 * @buf: char buffer to be filled with clocksource list
755 * 795 *
756 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
757 */ 797 */
758static ssize_t 798static ssize_t
759sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
760 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
761{ 801{
762 ssize_t count = 0; 802 ssize_t count = 0;
763 803
@@ -771,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev,
771/** 811/**
772 * sysfs_override_clocksource - interface for manually overriding clocksource 812 * sysfs_override_clocksource - interface for manually overriding clocksource
773 * @dev: unused 813 * @dev: unused
814 * @attr: unused
774 * @buf: name of override clocksource 815 * @buf: name of override clocksource
775 * @count: length of buffer 816 * @count: length of buffer
776 * 817 *
777 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
778 * clocksource selection. 819 * clocksource selection.
779 */ 820 */
780static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
781 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
782 const char *buf, size_t count) 823 const char *buf, size_t count)
783{ 824{
784 size_t ret = count; 825 size_t ret = count;
@@ -806,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
806/** 847/**
807 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
808 * @dev: unused 849 * @dev: unused
850 * @attr: unused
809 * @buf: char buffer to be filled with clocksource list 851 * @buf: char buffer to be filled with clocksource list
810 * 852 *
811 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
812 */ 854 */
813static ssize_t 855static ssize_t
814sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
815 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
816 char *buf) 858 char *buf)
817{ 859{
818 struct clocksource *src; 860 struct clocksource *src;
@@ -841,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
841/* 883/*
842 * Sysfs setup bits: 884 * Sysfs setup bits:
843 */ 885 */
844static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
845 sysfs_override_clocksource); 887 sysfs_override_clocksource);
846 888
847static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
848 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
849 891
850static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
851 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
852}; 895};
853 896
854static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
855 .id = 0, 898 .id = 0,
856 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
857}; 900};
858 901
859static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
860{ 903{
861 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
862 905
863 if (!error) 906 if (!error)
864 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
865 if (!error) 908 if (!error)
866 error = sysdev_create_file( 909 error = device_create_file(
867 &device_clocksource, 910 &device_clocksource,
868 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
869 if (!error) 912 if (!error)
870 error = sysdev_create_file( 913 error = device_create_file(
871 &device_clocksource, 914 &device_clocksource,
872 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
873 return error; 916 return error;
874} 917}
875 918
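
The clocksource block above follows the same recipe as the rest of the driver-core conversion in this series: SYSDEV_ATTR becomes DEVICE_ATTR, the old sysdev_class becomes a bus_type registered with subsys_system_register(), and the attribute files are created with device_create_file(). A minimal sketch of that recipe for a hypothetical "foo" subsystem (all names here are illustrative, not from the patch):

/* Hypothetical "foo" subsystem registered through the regular driver core. */
#include <linux/device.h>
#include <linux/init.h>
#include <linux/kernel.h>

static ssize_t foo_value_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "hello\n");		/* placeholder payload */
}
static DEVICE_ATTR(value, 0444, foo_value_show, NULL);

static struct bus_type foo_subsys = {
	.name = "foo",
	.dev_name = "foo",
};

static struct device device_foo = {
	.id = 0,
	.bus = &foo_subsys,
};

static int __init foo_sysfs_init(void)
{
	int error = subsys_system_register(&foo_subsys, NULL);

	if (!error)
		error = device_register(&device_foo);
	if (!error)
		error = device_create_file(&device_foo, &dev_attr_value);
	return error;
}
device_initcall(foo_sysfs_init);

With a registration like this the attribute should show up under /sys/devices/system/foo/foo0/value, which is why the existing /sys/devices/system/clocksource/clocksource0/* paths keep working after the conversion above.
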
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c340ca658f37..ce033c7aa2e8 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
21#include <linux/file.h> 22#include <linux/file.h>
22#include <linux/posix-clock.h> 23#include <linux/posix-clock.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d132738..fd4a7b1625a2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 71 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 72 return 0;
73 73
74 clockevents_exchange_device(NULL, dev); 74 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
75 tick_broadcast_device.evtdev = dev; 75 tick_broadcast_device.evtdev = dev;
76 if (!cpumask_empty(tick_get_broadcast_mask())) 76 if (!cpumask_empty(tick_get_broadcast_mask()))
77 tick_broadcast_start_periodic(dev); 77 tick_broadcast_start_periodic(dev);
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
194 for (next = dev->next_event; ;) { 194 for (next = dev->next_event; ;) {
195 next = ktime_add(next, tick_period); 195 next = ktime_add(next, tick_period);
196 196
197 if (!clockevents_program_event(dev, next, ktime_get())) 197 if (!clockevents_program_event(dev, next, false))
198 return; 198 return;
199 tick_do_periodic_broadcast(); 199 tick_do_periodic_broadcast();
200 } 200 }
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
373{ 373{
374 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
375 375
376 return tick_dev_program_event(bc, expires, force); 376 return clockevents_program_event(bc, expires, force);
377} 377}
378 378
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528de8235..da6c9ecad4e4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
94 */ 94 */
95 next = ktime_add(dev->next_event, tick_period); 95 next = ktime_add(dev->next_event, tick_period);
96 for (;;) { 96 for (;;) {
97 if (!clockevents_program_event(dev, next, ktime_get())) 97 if (!clockevents_program_event(dev, next, false))
98 return; 98 return;
99 /* 99 /*
100 * Have to be careful here. If we're in oneshot mode, 100 * Have to be careful here. If we're in oneshot mode,
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
139 for (;;) { 139 for (;;) {
140 if (!clockevents_program_event(dev, next, ktime_get())) 140 if (!clockevents_program_event(dev, next, false))
141 return; 141 return;
142 next = ktime_add(next, tick_period); 142 next = ktime_add(next, tick_period);
143 } 143 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 1009b06d6f89..4e265b901fed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
26extern void tick_setup_oneshot(struct clock_event_device *newdev, 26extern void tick_setup_oneshot(struct clock_event_device *newdev,
27 void (*handler)(struct clock_event_device *), 27 void (*handler)(struct clock_event_device *),
28 ktime_t nextevt); 28 ktime_t nextevt);
29extern int tick_dev_program_event(struct clock_event_device *dev,
30 ktime_t expires, int force);
31extern int tick_program_event(ktime_t expires, int force); 29extern int tick_program_event(ktime_t expires, int force);
32extern void tick_oneshot_notify(void); 30extern void tick_oneshot_notify(void);
33extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 31extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2d04411a5f05..824109060a33 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,74 +21,6 @@
21 21
22#include "tick-internal.h" 22#include "tick-internal.h"
23 23
24/* Limit min_delta to a jiffie */
25#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
26
27static int tick_increase_min_delta(struct clock_event_device *dev)
28{
29 /* Nothing to do if we already reached the limit */
30 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
31 return -ETIME;
32
33 if (dev->min_delta_ns < 5000)
34 dev->min_delta_ns = 5000;
35 else
36 dev->min_delta_ns += dev->min_delta_ns >> 1;
37
38 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
39 dev->min_delta_ns = MIN_DELTA_LIMIT;
40
41 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
42 dev->name ? dev->name : "?",
43 (unsigned long long) dev->min_delta_ns);
44 return 0;
45}
46
47/**
48 * tick_program_event internal worker function
49 */
50int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
51 int force)
52{
53 ktime_t now = ktime_get();
54 int i;
55
56 for (i = 0;;) {
57 int ret = clockevents_program_event(dev, expires, now);
58
59 if (!ret || !force)
60 return ret;
61
62 dev->retries++;
63 /*
64 * We tried 3 times to program the device with the given
65 * min_delta_ns. If that's not working then we increase it
66 * and emit a warning.
67 */
68 if (++i > 2) {
69 /* Increase the min. delta and try again */
70 if (tick_increase_min_delta(dev)) {
71 /*
72 * Get out of the loop if min_delta_ns
73 * hit the limit already. That's
74 * better than staying here forever.
75 *
76 * We clear next_event so we have a
77 * chance that the box survives.
78 */
79 printk(KERN_WARNING
80 "CE: Reprogramming failure. Giving up\n");
81 dev->next_event.tv64 = KTIME_MAX;
82 return -ETIME;
83 }
84 i = 0;
85 }
86
87 now = ktime_get();
88 expires = ktime_add_ns(now, dev->min_delta_ns);
89 }
90}
91
92/** 24/**
93 * tick_program_event 25 * tick_program_event
94 */ 26 */
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
96{ 28{
97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 29 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
98 30
99 return tick_dev_program_event(dev, expires, force); 31 return clockevents_program_event(dev, expires, force);
100} 32}
101 33
102/** 34/**
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force)
104 */ 36 */
105void tick_resume_oneshot(void) 37void tick_resume_oneshot(void)
106{ 38{
107 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
108 struct clock_event_device *dev = td->evtdev;
109 40
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
111 tick_program_event(ktime_get(), 1); 42 clockevents_program_event(dev, ktime_get(), true);
112} 43}
113 44
114/** 45/**
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
120{ 51{
121 newdev->event_handler = handler; 52 newdev->event_handler = handler;
122 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
123 tick_dev_program_event(newdev, next_event, 1); 54 clockevents_program_event(newdev, next_event, true);
124} 55}
125 56
126/** 57/**
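
The block removed above is the reprogramming retry loop: try clockevents_program_event() up to three times, then grow the device's min_delta_ns (jump to 5 us, then +50% per step) until it hits the one-jiffy limit, giving up only when even that fails. That logic now lives in the clockevents core, which is why the tick code can call clockevents_program_event() directly. A stand-alone sketch of how the escalation converges, with an assumed HZ of 250 and an illustrative 1 us starting value:

/* User-space simulation of the min_delta_ns escalation removed above. */
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		250				/* assumed for the example */
#define MIN_DELTA_LIMIT	(NSEC_PER_SEC / HZ)		/* limit min_delta to a jiffy */

int main(void)
{
	unsigned long long min_delta_ns = 1000;		/* device claims 1 us */

	while (min_delta_ns < MIN_DELTA_LIMIT) {
		if (min_delta_ns < 5000)
			min_delta_ns = 5000;
		else
			min_delta_ns += min_delta_ns >> 1;	/* grow by 50% */
		if (min_delta_ns > MIN_DELTA_LIMIT)
			min_delta_ns = MIN_DELTA_LIMIT;
		printf("min_delta_ns increased to %llu nsec\n", min_delta_ns);
	}
	return 0;
}
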
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
140 unsigned long flags; 140 unsigned long flags;
141 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
143 ts->idle_waketime = now; 142 ts->idle_waketime = now;
144 143
145 local_irq_save(flags); 144 local_irq_save(flags);
@@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
159 158
160 if (ts->idle_active) { 159 if (ts->idle_active) {
161 delta = ktime_sub(now, ts->idle_entrytime); 160 delta = ktime_sub(now, ts->idle_entrytime);
162 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
163 if (nr_iowait_cpu(cpu) > 0) 161 if (nr_iowait_cpu(cpu) > 0)
164 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 162 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
163 else
164 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 ts->idle_entrytime = now; 165 ts->idle_entrytime = now;
166 } 166 }
167 167
@@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
197/** 197/**
198 * get_cpu_idle_time_us - get the total idle time of a cpu 198 * get_cpu_idle_time_us - get the total idle time of a cpu
199 * @cpu: CPU number to query 199 * @cpu: CPU number to query
200 * @last_update_time: variable to store update time in 200 * @last_update_time: variable to store update time in. Do not update
201 * counters if NULL.
201 * 202 *
202 * Return the cumulative idle time (since boot) for a given 203 * Return the cumulative idle time (since boot) for a given
203 * CPU, in microseconds. The idle time returned includes 204 * CPU, in microseconds.
204 * the iowait time (unlike what "top" and co report).
205 * 205 *
206 * This time is measured via accounting rather than sampling, 206 * This time is measured via accounting rather than sampling,
207 * and is as accurate as ktime_get() is. 207 * and is as accurate as ktime_get() is.
@@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
212{ 212{
213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
214 ktime_t now, idle;
214 215
215 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
216 return -1; 217 return -1;
217 218
218 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 219 now = ktime_get();
220 if (last_update_time) {
221 update_ts_time_stats(cpu, ts, now, last_update_time);
222 idle = ts->idle_sleeptime;
223 } else {
224 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
225 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
226
227 idle = ktime_add(ts->idle_sleeptime, delta);
228 } else {
229 idle = ts->idle_sleeptime;
230 }
231 }
232
233 return ktime_to_us(idle);
219 234
220 return ktime_to_us(ts->idle_sleeptime);
221} 235}
222EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 236EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
223 237
224/* 238/**
225 * get_cpu_iowait_time_us - get the total iowait time of a cpu 239 * get_cpu_iowait_time_us - get the total iowait time of a cpu
226 * @cpu: CPU number to query 240 * @cpu: CPU number to query
227 * @last_update_time: variable to store update time in 241 * @last_update_time: variable to store update time in. Do not update
242 * counters if NULL.
228 * 243 *
229 * Return the cumulative iowait time (since boot) for a given 244 * Return the cumulative iowait time (since boot) for a given
230 * CPU, in microseconds. 245 * CPU, in microseconds.
@@ -237,52 +252,40 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
237u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 252u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
238{ 253{
239 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 254 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
255 ktime_t now, iowait;
240 256
241 if (!tick_nohz_enabled) 257 if (!tick_nohz_enabled)
242 return -1; 258 return -1;
243 259
244 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 260 now = ktime_get();
261 if (last_update_time) {
262 update_ts_time_stats(cpu, ts, now, last_update_time);
263 iowait = ts->iowait_sleeptime;
264 } else {
265 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
266 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
245 267
246 return ktime_to_us(ts->iowait_sleeptime); 268 iowait = ktime_add(ts->iowait_sleeptime, delta);
269 } else {
270 iowait = ts->iowait_sleeptime;
271 }
272 }
273
274 return ktime_to_us(iowait);
247} 275}
248EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
249 277
250/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
251 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
252 *
253 * When the next event is more than a tick into the future, stop the idle tick
254 * Called either from the idle loop or from irq_exit() when an idle period was
255 * just interrupted by an interrupt which did not cause a reschedule.
256 */
257void tick_nohz_stop_sched_tick(int inidle)
258{ 279{
259 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
260 struct tick_sched *ts;
261 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
262 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
263 u64 time_delta; 283 u64 time_delta;
264 int cpu; 284 int cpu;
265 285
266 local_irq_save(flags);
267
268 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
269 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
270 288
271 /*
272 * Call to tick_nohz_start_idle stops the last_update_time from being
273 * updated. Thus, it must not be called in the event we are called from
274 * irq_exit() with the prior state different than idle.
275 */
276 if (!inidle && !ts->inidle)
277 goto end;
278
279 /*
280 * Set ts->inidle unconditionally. Even if the system did not
281 * switch to NOHZ mode the cpu frequency governers rely on the
282 * update of the idle time accounting in tick_nohz_start_idle().
283 */
284 ts->inidle = 1;
285
286 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
287 290
288 /* 291 /*
@@ -298,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
298 } 301 }
299 302
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 goto end; 304 return;
302 305
303 if (need_resched()) 306 if (need_resched())
304 goto end; 307 return;
305 308
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit; 310 static int ratelimit;
@@ -311,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
311 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
312 ratelimit++; 315 ratelimit++;
313 } 316 }
314 goto end; 317 return;
315 } 318 }
316 319
317 ts->idle_calls++; 320 ts->idle_calls++;
@@ -389,9 +392,6 @@ void tick_nohz_stop_sched_tick(int inidle)
389 else 392 else
390 expires.tv64 = KTIME_MAX; 393 expires.tv64 = KTIME_MAX;
391 394
392 if (delta_jiffies > 1)
393 cpumask_set_cpu(cpu, nohz_cpu_mask);
394
395 /* Skip reprogram of event if its not changed */ 395 /* Skip reprogram of event if its not changed */
396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
397 goto out; 397 goto out;
@@ -409,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
410 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
411 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
412 rcu_enter_nohz();
413 } 412 }
414 413
415 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -441,15 +440,70 @@ void tick_nohz_stop_sched_tick(int inidle)
441 * softirq. 440 * softirq.
442 */ 441 */
443 tick_do_update_jiffies64(ktime_get()); 442 tick_do_update_jiffies64(ktime_get());
444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
445 } 443 }
446 raise_softirq_irqoff(TIMER_SOFTIRQ); 444 raise_softirq_irqoff(TIMER_SOFTIRQ);
447out: 445out:
448 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
449 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
450 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
451end: 449}
452 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick.
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
477 local_irq_disable();
478
479 ts = &__get_cpu_var(tick_cpu_sched);
480 /*
481 * Set ts->inidle unconditionally. Even if the system did not
482 * switch to NOHZ mode the cpu frequency governors rely on the
483 * update of the idle time accounting in tick_nohz_start_idle().
484 */
485 ts->inidle = 1;
486 tick_nohz_stop_sched_tick(ts);
487
488 local_irq_enable();
489}
490
491/**
492 * tick_nohz_irq_exit - update next tick event from interrupt exit
493 *
494 * When an interrupt fires while we are idle and it doesn't cause
495 * a reschedule, it may still add, modify or delete a timer, enqueue
496 * an RCU callback, etc...
497 * So we need to re-calculate and reprogram the next tick event.
498 */
499void tick_nohz_irq_exit(void)
500{
501 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
502
503 if (!ts->inidle)
504 return;
505
506 tick_nohz_stop_sched_tick(ts);
453} 507}
454 508
455/** 509/**
@@ -491,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
491} 545}
492 546
493/** 547/**
494 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
495 * 549 *
496 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
497 */ 553 */
498void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
499{ 555{
500 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
501 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -505,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
505 ktime_t now; 561 ktime_t now;
506 562
507 local_irq_disable(); 563 local_irq_disable();
564
508 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
509 now = ktime_get(); 566 now = ktime_get();
510 567
@@ -519,12 +576,9 @@ void tick_nohz_restart_sched_tick(void)
519 576
520 ts->inidle = 0; 577 ts->inidle = 0;
521 578
522 rcu_exit_nohz();
523
524 /* Update jiffies first */ 579 /* Update jiffies first */
525 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
526 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
527 cpumask_clear_cpu(cpu, nohz_cpu_mask);
528 582
529#ifndef CONFIG_VIRT_CPU_ACCOUNTING 583#ifndef CONFIG_VIRT_CPU_ACCOUNTING
530 /* 584 /*
@@ -640,8 +694,6 @@ static void tick_nohz_switch_to_nohz(void)
640 next = ktime_add(next, tick_period); 694 next = ktime_add(next, tick_period);
641 } 695 }
642 local_irq_enable(); 696 local_irq_enable();
643
644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
645} 697}
646 698
647/* 699/*
@@ -793,10 +845,8 @@ void tick_setup_sched_timer(void)
793 } 845 }
794 846
795#ifdef CONFIG_NO_HZ 847#ifdef CONFIG_NO_HZ
796 if (tick_nohz_enabled) { 848 if (tick_nohz_enabled)
797 ts->nohz_mode = NOHZ_MODE_HIGHRES; 849 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
800#endif 850#endif
801} 851}
802#endif /* HIGH_RES_TIMERS */ 852#endif /* HIGH_RES_TIMERS */
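
Taken together, the tick-sched changes above split the old tick_nohz_stop_sched_tick(inidle) / tick_nohz_restart_sched_tick() pair into tick_nohz_idle_enter(), tick_nohz_irq_exit() and tick_nohz_idle_exit(), and drop the rcu_enter_nohz()/rcu_exit_nohz() calls from this file: per the new kernel-doc, the architecture is now responsible for entering and leaving the RCU extended quiescent state around its low-power wait. A hedged sketch of what an arch idle loop is expected to look like with the new API (arch_cpu_sleep() is a placeholder for the architecture's wait instruction, and preemption handling is simplified):

/* Sketch only; real arch idle loops also handle polling, tracing, etc. */
#include <linux/tick.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void arch_cpu_sleep(void);	/* placeholder low-power wait */

void cpu_idle(void)
{
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {
			rcu_idle_enter();	/* last RCU use before sleeping */
			arch_cpu_sleep();
			rcu_idle_exit();	/* RCU usable again from here */
		}
		tick_nohz_idle_exit();
		schedule();
	}
}
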
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2b021b0e8507..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta converted to nanoseconds using ntp adjusted mult. */ 134 /* return delta converted to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 251 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
252 254
253 } while (read_seqretry(&xtime_lock, seq)); 255 } while (read_seqretry(&xtime_lock, seq));
254 /* 256 /*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
280 *ts = xtime; 282 *ts = xtime;
281 tomono = wall_to_monotonic; 283 tomono = wall_to_monotonic;
282 nsecs = timekeeping_get_ns(); 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
283 287
284 } while (read_seqretry(&xtime_lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
285 289
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)
802 s64 error, interval = timekeeper.cycle_interval; 806 s64 error, interval = timekeeper.cycle_interval;
803 int adj; 807 int adj;
804 808
809 /*
810 * The point of this is to check if the error is greater than half
811 * an interval.
812 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 *
815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing (shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is
818 * larger than half an interval.
819 *
820 * Note: It does not "save" on aggravation when reading the code.
821 */
805 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
806 if (error > interval) { 823 if (error > interval) {
824 /*
825 * We now divide error by 4 (via shift), which checks if
826 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust; if it is smaller,
828 * we can adjust by 1.
829 */
807 error >>= 2; 830 error >>= 2;
831 /*
832 * XXX - In update_wall_time, we round up to the next
833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely.
835 *
836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time.
840 */
808 if (likely(error <= interval)) 841 if (likely(error <= interval))
809 adj = 1; 842 adj = 1;
810 else 843 else
811 adj = timekeeping_bigadjust(error, &interval, &offset); 844 adj = timekeeping_bigadjust(error, &interval, &offset);
812 } else if (error < -interval) { 845 } else if (error < -interval) {
846 /* See comment above, this is just switched for the negative */
813 error >>= 2; 847 error >>= 2;
814 if (likely(error >= -interval)) { 848 if (likely(error >= -interval)) {
815 adj = -1; 849 adj = -1;
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)
817 offset = -offset; 851 offset = -offset;
818 } else 852 } else
819 adj = timekeeping_bigadjust(error, &interval, &offset); 853 adj = timekeeping_bigadjust(error, &interval, &offset);
820 } else 854 } else /* No adjustment needed */
821 return; 855 return;
822 856
857 WARN_ONCE(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult +
859 timekeeper.clock->maxadj),
860 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj);
864 /*
865 * So the following can be confusing.
866 *
867 * To keep things simple, let's assume adj == 1 for now.
868 *
869 * When adj != 1, remember that the interval and offset values
870 * have been appropriately scaled so the math is the same.
871 *
872 * The basic idea here is that we're increasing the multiplier
873 * by one; this causes the xtime_interval to be incremented by
874 * one cycle_interval. This is because:
875 * xtime_interval = cycle_interval * mult
876 * So if mult is being incremented by one:
877 * xtime_interval = cycle_interval * (mult + 1)
878 * It's the same as:
879 * xtime_interval = (cycle_interval * mult) + cycle_interval
880 * Which can be shortened to:
881 * xtime_interval += cycle_interval
882 *
883 * So offset stores the non-accumulated cycles. Thus the current
884 * time (in shifted nanoseconds) is:
885 * now = (offset * adj) + xtime_nsec
886 * Now, even though we're adjusting the clock frequency, we have
887 * to keep time consistent. In other words, we can't jump back
888 * in time, and we also want to avoid jumping forward in time.
889 *
890 * So given the same offset value, we need the time to be the same
891 * both before and after the freq adjustment.
892 * now = (offset * adj_1) + xtime_nsec_1
893 * now = (offset * adj_2) + xtime_nsec_2
894 * So:
895 * (offset * adj_1) + xtime_nsec_1 =
896 * (offset * adj_2) + xtime_nsec_2
897 * And we know:
898 * adj_2 = adj_1 + 1
899 * So:
900 * (offset * adj_1) + xtime_nsec_1 =
901 * (offset * (adj_1+1)) + xtime_nsec_2
902 * (offset * adj_1) + xtime_nsec_1 =
903 * (offset * adj_1) + offset + xtime_nsec_2
904 * Canceling the sides:
905 * xtime_nsec_1 = offset + xtime_nsec_2
906 * Which gives us:
907 * xtime_nsec_2 = xtime_nsec_1 - offset
908 * Which simplifies to:
909 * xtime_nsec -= offset
910 *
911 * XXX - TODO: Doc ntp_error calculation.
912 */
823 timekeeper.mult += adj; 913 timekeeper.mult += adj;
824 timekeeper.xtime_interval += interval; 914 timekeeper.xtime_interval += interval;
825 timekeeper.xtime_nsec -= offset; 915 timekeeper.xtime_nsec -= offset;
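
The long comment added to timekeeping_adjust() reduces to a single identity: when mult grows by adj, subtracting the (already adj-scaled) offset from xtime_nsec leaves offset * mult + xtime_nsec, and therefore the reported time, unchanged across the frequency adjustment. A stand-alone arithmetic check of that identity, with arbitrary example values rather than anything taken from the patch:

/* Verify: offset * mult + xtime_nsec is invariant under the adjustment. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	long long offset = 12345;	/* non-accumulated cycles (scaled) */
	long long mult = 1000;
	long long xtime_nsec = 987654321;
	long long adj = 1;

	long long before = offset * mult + xtime_nsec;

	mult += adj;			/* timekeeper.mult += adj */
	xtime_nsec -= offset * adj;	/* in the kernel this is just
					 * xtime_nsec -= offset, because offset
					 * already includes the adj scaling */

	long long after = offset * mult + xtime_nsec;

	assert(before == after);
	printf("time is continuous across the adjustment: %lld == %lld\n",
	       before, after);
	return 0;
}
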
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd8..0b537f27b559 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_SPINLOCK(table_lock); 84static DEFINE_RAW_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 spin_lock(&table_lock); 191 raw_spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 spin_unlock(&table_lock); 218 raw_spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff36119e4d..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
430/* Stub timer callback for improperly used timers. */
431static void stub_timer(unsigned long data)
432{
433 WARN_ON(1);
434}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
546static inline void debug_timer_assert_init(struct timer_list *timer)
547{
548 debug_object_assert_init(timer, &timer_debug_descr);
549}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
596static inline void debug_assert_init(struct timer_list *timer)
597{
598 debug_timer_assert_init(timer);
599}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid)
1368 int pid; 1418 int pid;
1369 1419
1370 rcu_read_lock(); 1420 rcu_read_lock();
1371 pid = task_tgid_vnr(current->real_parent); 1421 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1372 rcu_read_unlock(); 1422 rcu_read_unlock();
1373 1423
1374 return pid; 1424 return pid;
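
Besides the getppid() RCU fix at the end, the timer.c hunks above teach the debugobjects code a fixup_assert_init hook and call debug_assert_init() from del_timer() and try_to_del_timer_sync(), so a never-initialized timer is reported (and defanged by pointing it at stub_timer()) instead of being silently accepted. An illustrative, deliberately buggy caller of the kind the new hook is meant to catch under CONFIG_DEBUG_OBJECTS_TIMERS (the function name is hypothetical):

#include <linux/timer.h>
#include <linux/slab.h>

static void example_shutdown(void)
{
	struct timer_list *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return;
	/*
	 * BUG: init_timer()/setup_timer() was never called on 't'.
	 * With this patch, debug_assert_init() in del_timer() flags the
	 * untracked object and installs stub_timer() as its callback.
	 */
	del_timer(t);
	kfree(t);
}
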
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..5f39a07fe5ea 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18CFLAGS_trace_events_filter.o := -I$(src)
19
18# 20#
19# Make the trace clocks available generally: it's infrastructure 21# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example: 22# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif
56ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 63endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7c910a5593a6..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
26#include <linux/time.h> 27#include <linux/time.h>
27#include <linux/uaccess.h> 28#include <linux/uaccess.h>
28 29
@@ -401,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
401 402
402static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
403 struct dentry *parent, 404 struct dentry *parent,
404 int mode, 405 umode_t mode,
405 struct rchan_buf *buf, 406 struct rchan_buf *buf,
406 int *is_global) 407 int *is_global)
407{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e7829..b1e8943fed1d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/module.h>
25#include <linux/ftrace.h> 26#include <linux/ftrace.h>
26#include <linux/sysctl.h> 27#include <linux/sysctl.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
@@ -151,7 +152,6 @@ void clear_ftrace_function(void)
151 ftrace_pid_function = ftrace_stub; 152 ftrace_pid_function = ftrace_stub;
152} 153}
153 154
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
156/* 156/*
157 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -1211,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1211 if (!src->count) { 1211 if (!src->count) {
1212 free_ftrace_hash_rcu(*dst); 1212 free_ftrace_hash_rcu(*dst);
1213 rcu_assign_pointer(*dst, EMPTY_HASH); 1213 rcu_assign_pointer(*dst, EMPTY_HASH);
1214 return 0; 1214 /* still need to update the function records */
1215 ret = 0;
1216 goto out;
1215 } 1217 }
1216 1218
1217 /* 1219 /*
@@ -3863,6 +3865,14 @@ void ftrace_kill(void)
3863} 3865}
3864 3866
3865/** 3867/**
3868 * ftrace_is_dead - Test if ftrace is dead or not.
3869 */
3870int ftrace_is_dead(void)
3871{
3872 return ftrace_disabled;
3873}
3874
3875/**
3866 * register_ftrace_function - register a function for profiling 3876 * register_ftrace_function - register a function for profiling
3867 * @ops - ops structure that holds the function for profiling. 3877 * @ops - ops structure that holds the function for profiling.
3868 * 3878 *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..f5b7b5c1195b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
478 int cpu; 478 int cpu;
479 atomic_t record_disabled; 479 atomic_t record_disabled;
480 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
481 spinlock_t reader_lock; /* serialize readers */ 481 raw_spinlock_t reader_lock; /* serialize readers */
482 arch_spinlock_t lock; 482 arch_spinlock_t lock;
483 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
484 struct list_head *pages; 484 struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
488 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
489 unsigned long lost_events; 489 unsigned long lost_events;
490 unsigned long last_overrun; 490 unsigned long last_overrun;
491 local_t entries_bytes;
491 local_t commit_overrun; 492 local_t commit_overrun;
492 local_t overrun; 493 local_t overrun;
493 local_t entries; 494 local_t entries;
494 local_t committing; 495 local_t committing;
495 local_t commits; 496 local_t commits;
496 unsigned long read; 497 unsigned long read;
498 unsigned long read_bytes;
497 u64 write_stamp; 499 u64 write_stamp;
498 u64 read_stamp; 500 u64 read_stamp;
499}; 501};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1062 1064
1063 cpu_buffer->cpu = cpu; 1065 cpu_buffer->cpu = cpu;
1064 cpu_buffer->buffer = buffer; 1066 cpu_buffer->buffer = buffer;
1065 spin_lock_init(&cpu_buffer->reader_lock); 1067 raw_spin_lock_init(&cpu_buffer->reader_lock);
1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1068 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1069 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1068 1070
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1259 struct list_head *p; 1261 struct list_head *p;
1260 unsigned i; 1262 unsigned i;
1261 1263
1262 spin_lock_irq(&cpu_buffer->reader_lock); 1264 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1263 rb_head_page_deactivate(cpu_buffer); 1265 rb_head_page_deactivate(cpu_buffer);
1264 1266
1265 for (i = 0; i < nr_pages; i++) { 1267 for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1277 rb_check_pages(cpu_buffer); 1279 rb_check_pages(cpu_buffer);
1278 1280
1279out: 1281out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock); 1282 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1281} 1283}
1282 1284
1283static void 1285static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1288 struct list_head *p; 1290 struct list_head *p;
1289 unsigned i; 1291 unsigned i;
1290 1292
1291 spin_lock_irq(&cpu_buffer->reader_lock); 1293 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1292 rb_head_page_deactivate(cpu_buffer); 1294 rb_head_page_deactivate(cpu_buffer);
1293 1295
1294 for (i = 0; i < nr_pages; i++) { 1296 for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1303 rb_check_pages(cpu_buffer); 1305 rb_check_pages(cpu_buffer);
1304 1306
1305out: 1307out:
1306 spin_unlock_irq(&cpu_buffer->reader_lock); 1308 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1307} 1309}
1308 1310
1309/** 1311/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1708 * the counters. 1710 * the counters.
1709 */ 1711 */
1710 local_add(entries, &cpu_buffer->overrun); 1712 local_add(entries, &cpu_buffer->overrun);
1713 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1711 1714
1712 /* 1715 /*
1713 * The entries will be zeroed out when we move the 1716 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1863 event = __rb_page_index(tail_page, tail); 1866 event = __rb_page_index(tail_page, tail);
1864 kmemcheck_annotate_bitfield(event, bitfield); 1867 kmemcheck_annotate_bitfield(event, bitfield);
1865 1868
1869 /* account for padding bytes */
1870 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
1871
1866 /* 1872 /*
1867 * Save the original length to the meta data. 1873 * Save the original length to the meta data.
1868 * This will be used by the reader to add lost event 1874 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2054 if (!tail) 2060 if (!tail)
2055 tail_page->page->time_stamp = ts; 2061 tail_page->page->time_stamp = ts;
2056 2062
2063 /* account for these added bytes */
2064 local_add(length, &cpu_buffer->entries_bytes);
2065
2057 return event; 2066 return event;
2058} 2067}
2059 2068
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2085 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2077 unsigned long write_mask = 2086 unsigned long write_mask =
2078 local_read(&bpage->write) & ~RB_WRITE_MASK; 2087 local_read(&bpage->write) & ~RB_WRITE_MASK;
2088 unsigned long event_length = rb_event_length(event);
2079 /* 2089 /*
2080 * This is on the tail page. It is possible that 2090 * This is on the tail page. It is possible that
2081 * a write could come in and move the tail page 2091 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2085 old_index += write_mask; 2095 old_index += write_mask;
2086 new_index += write_mask; 2096 new_index += write_mask;
2087 index = local_cmpxchg(&bpage->write, old_index, new_index); 2097 index = local_cmpxchg(&bpage->write, old_index, new_index);
2088 if (index == old_index) 2098 if (index == old_index) {
2099 /* update counters */
2100 local_sub(event_length, &cpu_buffer->entries_bytes);
2089 return 1; 2101 return 1;
2102 }
2090 } 2103 }
2091 2104
2092 /* could not discard */ 2105 /* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2661} 2674}
2662 2675
2663/** 2676/**
2677 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2678 * @buffer: The ring buffer
2679 * @cpu: The per CPU buffer to read from.
2680 */
2681unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2682{
2683 unsigned long flags;
2684 struct ring_buffer_per_cpu *cpu_buffer;
2685 struct buffer_page *bpage;
2686 unsigned long ret;
2687
2688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2689 return 0;
2690
2691 cpu_buffer = buffer->buffers[cpu];
2692 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2693 /*
2694 * if the tail is on reader_page, oldest time stamp is on the reader
2695 * page
2696 */
2697 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2698 bpage = cpu_buffer->reader_page;
2699 else
2700 bpage = rb_set_head_page(cpu_buffer);
2701 ret = bpage->page->time_stamp;
2702 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2703
2704 return ret;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2707
2708/**
2709 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2710 * @buffer: The ring buffer
2711 * @cpu: The per CPU buffer to read from.
2712 */
2713unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 unsigned long ret;
2717
2718 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2719 return 0;
2720
2721 cpu_buffer = buffer->buffers[cpu];
2722 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2723
2724 return ret;
2725}
2726EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2727
2728/**
2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2729 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2665 * @buffer: The ring buffer 2730 * @buffer: The ring buffer
2666 * @cpu: The per CPU buffer to get the entries from. 2731 * @cpu: The per CPU buffer to get the entries from.
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2804 2869
2805 cpu_buffer = iter->cpu_buffer; 2870 cpu_buffer = iter->cpu_buffer;
2806 2871
2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2872 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2808 rb_iter_reset(iter); 2873 rb_iter_reset(iter);
2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2874 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2810} 2875}
2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2876EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2812 2877
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3265 again: 3330 again:
3266 local_irq_save(flags); 3331 local_irq_save(flags);
3267 if (dolock) 3332 if (dolock)
3268 spin_lock(&cpu_buffer->reader_lock); 3333 raw_spin_lock(&cpu_buffer->reader_lock);
3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3334 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3270 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3335 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3271 rb_advance_reader(cpu_buffer); 3336 rb_advance_reader(cpu_buffer);
3272 if (dolock) 3337 if (dolock)
3273 spin_unlock(&cpu_buffer->reader_lock); 3338 raw_spin_unlock(&cpu_buffer->reader_lock);
3274 local_irq_restore(flags); 3339 local_irq_restore(flags);
3275 3340
3276 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3341 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3295 unsigned long flags; 3360 unsigned long flags;
3296 3361
3297 again: 3362 again:
3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3363 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3299 event = rb_iter_peek(iter, ts); 3364 event = rb_iter_peek(iter, ts);
3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3365 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3301 3366
3302 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3367 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3303 goto again; 3368 goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3337 cpu_buffer = buffer->buffers[cpu]; 3402 cpu_buffer = buffer->buffers[cpu];
3338 local_irq_save(flags); 3403 local_irq_save(flags);
3339 if (dolock) 3404 if (dolock)
3340 spin_lock(&cpu_buffer->reader_lock); 3405 raw_spin_lock(&cpu_buffer->reader_lock);
3341 3406
3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3407 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3343 if (event) { 3408 if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3346 } 3411 }
3347 3412
3348 if (dolock) 3413 if (dolock)
3349 spin_unlock(&cpu_buffer->reader_lock); 3414 raw_spin_unlock(&cpu_buffer->reader_lock);
3350 local_irq_restore(flags); 3415 local_irq_restore(flags);
3351 3416
3352 out: 3417 out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3438 3503
3439 cpu_buffer = iter->cpu_buffer; 3504 cpu_buffer = iter->cpu_buffer;
3440 3505
3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3506 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3442 arch_spin_lock(&cpu_buffer->lock); 3507 arch_spin_lock(&cpu_buffer->lock);
3443 rb_iter_reset(iter); 3508 rb_iter_reset(iter);
3444 arch_spin_unlock(&cpu_buffer->lock); 3509 arch_spin_unlock(&cpu_buffer->lock);
3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3510 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3446} 3511}
3447EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3512EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3448 3513
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3478 unsigned long flags; 3543 unsigned long flags;
3479 3544
3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3545 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3481 again: 3546 again:
3482 event = rb_iter_peek(iter, ts); 3547 event = rb_iter_peek(iter, ts);
3483 if (!event) 3548 if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3488 3553
3489 rb_advance_iter(iter); 3554 rb_advance_iter(iter);
3490 out: 3555 out:
3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3556 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3492 3557
3493 return event; 3558 return event;
3494} 3559}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3527 cpu_buffer->reader_page->read = 0; 3592 cpu_buffer->reader_page->read = 0;
3528 3593
3529 local_set(&cpu_buffer->commit_overrun, 0); 3594 local_set(&cpu_buffer->commit_overrun, 0);
3595 local_set(&cpu_buffer->entries_bytes, 0);
3530 local_set(&cpu_buffer->overrun, 0); 3596 local_set(&cpu_buffer->overrun, 0);
3531 local_set(&cpu_buffer->entries, 0); 3597 local_set(&cpu_buffer->entries, 0);
3532 local_set(&cpu_buffer->committing, 0); 3598 local_set(&cpu_buffer->committing, 0);
3533 local_set(&cpu_buffer->commits, 0); 3599 local_set(&cpu_buffer->commits, 0);
3534 cpu_buffer->read = 0; 3600 cpu_buffer->read = 0;
3601 cpu_buffer->read_bytes = 0;
3535 3602
3536 cpu_buffer->write_stamp = 0; 3603 cpu_buffer->write_stamp = 0;
3537 cpu_buffer->read_stamp = 0; 3604 cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3557 3624
3558 atomic_inc(&cpu_buffer->record_disabled); 3625 atomic_inc(&cpu_buffer->record_disabled);
3559 3626
3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3627 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3561 3628
3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3629 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3563 goto out; 3630 goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3569 arch_spin_unlock(&cpu_buffer->lock); 3636 arch_spin_unlock(&cpu_buffer->lock);
3570 3637
3571 out: 3638 out:
3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3639 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3573 3640
3574 atomic_dec(&cpu_buffer->record_disabled); 3641 atomic_dec(&cpu_buffer->record_disabled);
3575} 3642}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3607 cpu_buffer = buffer->buffers[cpu]; 3674 cpu_buffer = buffer->buffers[cpu];
3608 local_irq_save(flags); 3675 local_irq_save(flags);
3609 if (dolock) 3676 if (dolock)
3610 spin_lock(&cpu_buffer->reader_lock); 3677 raw_spin_lock(&cpu_buffer->reader_lock);
3611 ret = rb_per_cpu_empty(cpu_buffer); 3678 ret = rb_per_cpu_empty(cpu_buffer);
3612 if (dolock) 3679 if (dolock)
3613 spin_unlock(&cpu_buffer->reader_lock); 3680 raw_spin_unlock(&cpu_buffer->reader_lock);
3614 local_irq_restore(flags); 3681 local_irq_restore(flags);
3615 3682
3616 if (!ret) 3683 if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3641 cpu_buffer = buffer->buffers[cpu]; 3708 cpu_buffer = buffer->buffers[cpu];
3642 local_irq_save(flags); 3709 local_irq_save(flags);
3643 if (dolock) 3710 if (dolock)
3644 spin_lock(&cpu_buffer->reader_lock); 3711 raw_spin_lock(&cpu_buffer->reader_lock);
3645 ret = rb_per_cpu_empty(cpu_buffer); 3712 ret = rb_per_cpu_empty(cpu_buffer);
3646 if (dolock) 3713 if (dolock)
3647 spin_unlock(&cpu_buffer->reader_lock); 3714 raw_spin_unlock(&cpu_buffer->reader_lock);
3648 local_irq_restore(flags); 3715 local_irq_restore(flags);
3649 3716
3650 return ret; 3717 return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3841 if (!bpage) 3908 if (!bpage)
3842 goto out; 3909 goto out;
3843 3910
3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3911 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3845 3912
3846 reader = rb_get_reader_page(cpu_buffer); 3913 reader = rb_get_reader_page(cpu_buffer);
3847 if (!reader) 3914 if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3918 } else { 3985 } else {
3919 /* update the entry counter */ 3986 /* update the entry counter */
3920 cpu_buffer->read += rb_page_entries(reader); 3987 cpu_buffer->read += rb_page_entries(reader);
3988 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
3921 3989
3922 /* swap the pages */ 3990 /* swap the pages */
3923 rb_init_page(bpage); 3991 rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 4032 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3965 4033
3966 out_unlock: 4034 out_unlock:
3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 4035 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3968 4036
3969 out: 4037 out:
3970 return ret; 4038 return ret;
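The ring_buffer.c hunks above switch reader_lock from spinlock_t to raw_spinlock_t, so the lock keeps spinning rather than turning into a sleeping lock under preempt-rt, and add byte-level accounting via the new read_bytes field. A minimal sketch of the lock-conversion pattern, using a hypothetical my_buffer structure rather than the real ring-buffer types:

#include <linux/spinlock.h>

/* Hypothetical stand-in for struct ring_buffer_per_cpu; initialise the
 * lock elsewhere with raw_spin_lock_init(&b->lock). */
struct my_buffer {
        raw_spinlock_t  lock;           /* raw_: never becomes a sleeping lock on -rt */
        unsigned long   entries;
        unsigned long   read_bytes;     /* byte accounting as in the new read_bytes */
};

static void my_buffer_consume(struct my_buffer *b, unsigned long bytes)
{
        unsigned long flags;

        /* the raw_ variants mirror the plain spin_lock_irqsave() API one for one */
        raw_spin_lock_irqsave(&b->lock, flags);
        b->entries--;
        b->read_bytes += bytes;
        raw_spin_unlock_irqrestore(&b->lock, flags);
}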
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000000..4b3b5eaf94d1
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
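The new rpm-traces.c only instantiates the tracepoints declared in trace/events/rpm.h (via CREATE_TRACE_POINTS) and exports them, presumably so modular code in the runtime-PM paths can fire them. Once built in, the events should appear under the rpm subsystem in tracefs. A hedged userspace sketch that enables them and dumps the trace; the mount point is an assumption (newer kernels also expose /sys/kernel/tracing):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/debug/tracing"

/* write a short string to a tracefs control file */
static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        close(fd);
        return 0;
}

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        /* enable every event in the rpm subsystem and make sure tracing is on */
        if (write_str(TRACEFS "/events/rpm/enable", "1") ||
            write_str(TRACEFS "/tracing_on", "1"))
                return 1;

        sleep(5);       /* give runtime PM a chance to suspend/resume something */

        fd = open(TRACEFS "/trace", O_RDONLY);
        if (fd < 0)
                return 1;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
}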
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,10 +338,11 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
345 346
346static void wakeup_work_handler(struct work_struct *work) 347static void wakeup_work_handler(struct work_struct *work)
347{ 348{
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -435,6 +437,7 @@ static struct {
435} trace_clocks[] = { 437} trace_clocks[] = {
436 { trace_clock_local, "local" }, 438 { trace_clock_local, "local" },
437 { trace_clock_global, "global" }, 439 { trace_clock_global, "global" },
440 { trace_clock_counter, "counter" },
438}; 441};
439 442
440int trace_clock_id; 443int trace_clock_id;
@@ -960,7 +963,7 @@ void tracing_start(void)
960 if (tracing_disabled) 963 if (tracing_disabled)
961 return; 964 return;
962 965
963 spin_lock_irqsave(&tracing_start_lock, flags); 966 raw_spin_lock_irqsave(&tracing_start_lock, flags);
964 if (--trace_stop_count) { 967 if (--trace_stop_count) {
965 if (trace_stop_count < 0) { 968 if (trace_stop_count < 0) {
966 /* Someone screwed up their debugging */ 969 /* Someone screwed up their debugging */
@@ -985,7 +988,7 @@ void tracing_start(void)
985 988
986 ftrace_start(); 989 ftrace_start();
987 out: 990 out:
988 spin_unlock_irqrestore(&tracing_start_lock, flags); 991 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
989} 992}
990 993
991/** 994/**
@@ -1000,7 +1003,7 @@ void tracing_stop(void)
1000 unsigned long flags; 1003 unsigned long flags;
1001 1004
1002 ftrace_stop(); 1005 ftrace_stop();
1003 spin_lock_irqsave(&tracing_start_lock, flags); 1006 raw_spin_lock_irqsave(&tracing_start_lock, flags);
1004 if (trace_stop_count++) 1007 if (trace_stop_count++)
1005 goto out; 1008 goto out;
1006 1009
@@ -1018,7 +1021,7 @@ void tracing_stop(void)
1018 arch_spin_unlock(&ftrace_max_lock); 1021 arch_spin_unlock(&ftrace_max_lock);
1019 1022
1020 out: 1023 out:
1021 spin_unlock_irqrestore(&tracing_start_lock, flags); 1024 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
1022} 1025}
1023 1026
1024void trace_stop_cmdline_recording(void); 1027void trace_stop_cmdline_recording(void);
@@ -1842,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1842 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1843} 1846}
1844 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1845static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1846{ 1876{
1847 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1854,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1854 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1855} 1885}
1856 1886
1857static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1858{ 1899{
1859 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1860 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1861} 1903}
1862 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1863 1916
1864void 1917void
1865print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1868,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1868 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1869 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1870 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1871 unsigned long entries = 0; 1924 unsigned long entries;
1872 unsigned long total = 0; 1925 unsigned long total;
1873 unsigned long count;
1874 const char *name = "preemption"; 1926 const char *name = "preemption";
1875 int cpu;
1876 1927
1877 if (type) 1928 if (type)
1878 name = type->name; 1929 name = type->name;
1879 1930
1880 1931 get_total_entries(tr, &total, &entries);
1881 for_each_tracing_cpu(cpu) {
1882 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1883 /*
1884 * If this buffer has skipped entries, then we hold all
1885 * entries for the trace and we need to ignore the
1886 * ones before the time stamp.
1887 */
1888 if (tr->data[cpu]->skipped_entries) {
1889 count -= tr->data[cpu]->skipped_entries;
1890 /* total is the same as the entries */
1891 total += count;
1892 } else
1893 total += count +
1894 ring_buffer_overrun_cpu(tr->buffer, cpu);
1895 entries += count;
1896 }
1897 1932
1898 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1899 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2139,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2139 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2140} 2175}
2141 2176
2177void trace_latency_header(struct seq_file *m)
2178{
2179 struct trace_iterator *iter = m->private;
2180
2181 /* print nothing if the buffers are empty */
2182 if (trace_empty(iter))
2183 return;
2184
2185 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2186 print_trace_header(m, iter);
2187
2188 if (!(trace_flags & TRACE_ITER_VERBOSE))
2189 print_lat_help_header(m);
2190}
2191
2142void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2143{ 2193{
2144 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2154,11 +2204,23 @@ void trace_default_header(struct seq_file *m)
2154 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2155 print_lat_help_header(m); 2205 print_lat_help_header(m);
2156 } else { 2206 } else {
2157 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2158 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2159 } 2213 }
2160} 2214}
2161 2215
2216static void test_ftrace_alive(struct seq_file *m)
2217{
2218 if (!ftrace_is_dead())
2219 return;
2220 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2221 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2222}
2223
2162static int s_show(struct seq_file *m, void *v) 2224static int s_show(struct seq_file *m, void *v)
2163{ 2225{
2164 struct trace_iterator *iter = v; 2226 struct trace_iterator *iter = v;
@@ -2168,6 +2230,7 @@ static int s_show(struct seq_file *m, void *v)
2168 if (iter->tr) { 2230 if (iter->tr) {
2169 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2231 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2170 seq_puts(m, "#\n"); 2232 seq_puts(m, "#\n");
2233 test_ftrace_alive(m);
2171 } 2234 }
2172 if (iter->trace && iter->trace->print_header) 2235 if (iter->trace && iter->trace->print_header)
2173 iter->trace->print_header(m); 2236 iter->trace->print_header(m);
@@ -2710,9 +2773,9 @@ static const char readme_msg[] =
2710 "# cat /sys/kernel/debug/tracing/trace_options\n" 2773 "# cat /sys/kernel/debug/tracing/trace_options\n"
2711 "noprint-parent nosym-offset nosym-addr noverbose\n" 2774 "noprint-parent nosym-offset nosym-addr noverbose\n"
2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" 2776 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2777 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" 2778 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
2716; 2779;
2717 2780
2718static ssize_t 2781static ssize_t
@@ -3569,6 +3632,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3569} 3632}
3570 3633
3571static ssize_t 3634static ssize_t
3635tracing_total_entries_read(struct file *filp, char __user *ubuf,
3636 size_t cnt, loff_t *ppos)
3637{
3638 struct trace_array *tr = filp->private_data;
3639 char buf[64];
3640 int r, cpu;
3641 unsigned long size = 0, expanded_size = 0;
3642
3643 mutex_lock(&trace_types_lock);
3644 for_each_tracing_cpu(cpu) {
3645 size += tr->entries >> 10;
3646 if (!ring_buffer_expanded)
3647 expanded_size += trace_buf_size >> 10;
3648 }
3649 if (ring_buffer_expanded)
3650 r = sprintf(buf, "%lu\n", size);
3651 else
3652 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3653 mutex_unlock(&trace_types_lock);
3654
3655 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3656}
3657
3658static ssize_t
3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3659tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos) 3660 size_t cnt, loff_t *ppos)
3574{ 3661{
@@ -3594,22 +3681,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3594 return 0; 3681 return 0;
3595} 3682}
3596 3683
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3607static ssize_t 3684static ssize_t
3608tracing_mark_write(struct file *filp, const char __user *ubuf, 3685tracing_mark_write(struct file *filp, const char __user *ubuf,
3609 size_t cnt, loff_t *fpos) 3686 size_t cnt, loff_t *fpos)
3610{ 3687{
3611 char *buf; 3688 unsigned long addr = (unsigned long)ubuf;
3612 size_t written; 3689 struct ring_buffer_event *event;
3690 struct ring_buffer *buffer;
3691 struct print_entry *entry;
3692 unsigned long irq_flags;
3693 struct page *pages[2];
3694 int nr_pages = 1;
3695 ssize_t written;
3696 void *page1;
3697 void *page2;
3698 int offset;
3699 int size;
3700 int len;
3701 int ret;
3613 3702
3614 if (tracing_disabled) 3703 if (tracing_disabled)
3615 return -EINVAL; 3704 return -EINVAL;
@@ -3617,28 +3706,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3617 if (cnt > TRACE_BUF_SIZE) 3706 if (cnt > TRACE_BUF_SIZE)
3618 cnt = TRACE_BUF_SIZE; 3707 cnt = TRACE_BUF_SIZE;
3619 3708
3620 buf = kmalloc(cnt + 2, GFP_KERNEL); 3709 /*
3621 if (buf == NULL) 3710 * Userspace is injecting traces into the kernel trace buffer.
3622 return -ENOMEM; 3711 * We want to be as non intrusive as possible.
3712 * To do so, we do not want to allocate any special buffers
3713 * or take any locks, but instead write the userspace data
3714 * straight into the ring buffer.
3715 *
 3716	 * First we need to pin the userspace buffer into memory,
 3717	 * which it most likely already is, because the caller just referenced it.
3718 * But there's no guarantee that it is. By using get_user_pages_fast()
3719 * and kmap_atomic/kunmap_atomic() we can get access to the
3720 * pages directly. We then write the data directly into the
3721 * ring buffer.
3722 */
3723 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3623 3724
3624 if (copy_from_user(buf, ubuf, cnt)) { 3725 /* check if we cross pages */
3625 kfree(buf); 3726 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3626 return -EFAULT; 3727 nr_pages = 2;
3728
3729 offset = addr & (PAGE_SIZE - 1);
3730 addr &= PAGE_MASK;
3731
3732 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3733 if (ret < nr_pages) {
3734 while (--ret >= 0)
3735 put_page(pages[ret]);
3736 written = -EFAULT;
3737 goto out;
3627 } 3738 }
3628 if (buf[cnt-1] != '\n') { 3739
3629 buf[cnt] = '\n'; 3740 page1 = kmap_atomic(pages[0]);
3630 buf[cnt+1] = '\0'; 3741 if (nr_pages == 2)
3742 page2 = kmap_atomic(pages[1]);
3743
3744 local_save_flags(irq_flags);
3745 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3746 buffer = global_trace.buffer;
3747 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3748 irq_flags, preempt_count());
3749 if (!event) {
3750 /* Ring buffer disabled, return as if not open for write */
3751 written = -EBADF;
3752 goto out_unlock;
3753 }
3754
3755 entry = ring_buffer_event_data(event);
3756 entry->ip = _THIS_IP_;
3757
3758 if (nr_pages == 2) {
3759 len = PAGE_SIZE - offset;
3760 memcpy(&entry->buf, page1 + offset, len);
3761 memcpy(&entry->buf[len], page2, cnt - len);
3631 } else 3762 } else
3632 buf[cnt] = '\0'; 3763 memcpy(&entry->buf, page1 + offset, cnt);
3633 3764
3634 written = mark_printk("%s", buf); 3765 if (entry->buf[cnt - 1] != '\n') {
3635 kfree(buf); 3766 entry->buf[cnt] = '\n';
3636 *fpos += written; 3767 entry->buf[cnt + 1] = '\0';
3768 } else
3769 entry->buf[cnt] = '\0';
3770
3771 ring_buffer_unlock_commit(buffer, event);
3772
3773 written = cnt;
3637 3774
3638 /* don't tell userspace we wrote more - it might confuse them */ 3775 *fpos += written;
3639 if (written > cnt)
3640 written = cnt;
3641 3776
3777 out_unlock:
3778 if (nr_pages == 2)
3779 kunmap_atomic(page2);
3780 kunmap_atomic(page1);
3781 while (nr_pages > 0)
3782 put_page(pages[--nr_pages]);
3783 out:
3642 return written; 3784 return written;
3643} 3785}
3644 3786
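The rewritten tracing_mark_write() above pins the user pages with get_user_pages_fast() and copies the message straight into a TRACE_PRINT event, so a write to trace_marker now costs one ring-buffer reservation and at most two page copies, with no temporary kmalloc(). From userspace nothing changes, except that a single write larger than TRACE_BUF_SIZE is truncated. A small sketch; the tracefs path is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *msg = "hello from userspace\n";
        int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

        if (fd < 0) {
                perror("trace_marker");
                return 1;
        }
        /* one write() becomes one TRACE_PRINT event in the ring buffer;
         * writes longer than TRACE_BUF_SIZE are truncated by the kernel */
        if (write(fd, msg, strlen(msg)) < 0)
                perror("write");
        close(fd);
        return 0;
}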
@@ -3739,6 +3881,12 @@ static const struct file_operations tracing_entries_fops = {
3739 .llseek = generic_file_llseek, 3881 .llseek = generic_file_llseek,
3740}; 3882};
3741 3883
3884static const struct file_operations tracing_total_entries_fops = {
3885 .open = tracing_open_generic,
3886 .read = tracing_total_entries_read,
3887 .llseek = generic_file_llseek,
3888};
3889
3742static const struct file_operations tracing_free_buffer_fops = { 3890static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write, 3891 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release, 3892 .release = tracing_free_buffer_release,
@@ -3808,8 +3956,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3808 if (info->read < PAGE_SIZE) 3956 if (info->read < PAGE_SIZE)
3809 goto read; 3957 goto read;
3810 3958
3811 info->read = 0;
3812
3813 trace_access_lock(info->cpu); 3959 trace_access_lock(info->cpu);
3814 ret = ring_buffer_read_page(info->tr->buffer, 3960 ret = ring_buffer_read_page(info->tr->buffer,
3815 &info->spare, 3961 &info->spare,
@@ -3819,6 +3965,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3819 if (ret < 0) 3965 if (ret < 0)
3820 return 0; 3966 return 0;
3821 3967
3968 info->read = 0;
3969
3822read: 3970read:
3823 size = PAGE_SIZE - info->read; 3971 size = PAGE_SIZE - info->read;
3824 if (size > count) 3972 if (size > count)
@@ -4026,6 +4174,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4026 struct trace_array *tr = &global_trace; 4174 struct trace_array *tr = &global_trace;
4027 struct trace_seq *s; 4175 struct trace_seq *s;
4028 unsigned long cnt; 4176 unsigned long cnt;
4177 unsigned long long t;
4178 unsigned long usec_rem;
4029 4179
4030 s = kmalloc(sizeof(*s), GFP_KERNEL); 4180 s = kmalloc(sizeof(*s), GFP_KERNEL);
4031 if (!s) 4181 if (!s)
@@ -4042,6 +4192,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4192 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4193 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4044 4194
4195 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4196 trace_seq_printf(s, "bytes: %ld\n", cnt);
4197
4198 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4199 usec_rem = do_div(t, USEC_PER_SEC);
4200 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
4201
4202 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4203 usec_rem = do_div(t, USEC_PER_SEC);
4204 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4205
4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4206 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4046 4207
4047 kfree(s); 4208 kfree(s);
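tracing_stats_read() now also reports a byte count and two timestamps; the timestamps are converted from nanoseconds with ns2usecs() and then split into seconds plus a microsecond remainder by do_div(), which leaves the quotient in place and returns the remainder. A userspace analogue of that formatting, with plain 64-bit division standing in for the kernel-only do_div():

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define USEC_PER_SEC  1000000ULL

/* mirror of the "oldest event ts"/"now ts" output: ns -> "sec.usec" */
static void print_ts(const char *label, uint64_t ns)
{
        uint64_t usecs = ns / NSEC_PER_USEC;            /* roughly ns2usecs() */
        uint64_t sec = usecs / USEC_PER_SEC;            /* quotient do_div() leaves in t */
        uint64_t usec_rem = usecs % USEC_PER_SEC;       /* remainder do_div() returns */

        printf("%s: %5" PRIu64 ".%06" PRIu64 "\n", label, sec, usec_rem);
}

int main(void)
{
        /* arbitrary example inputs */
        print_ts("oldest event ts", 1234567890123ULL);
        print_ts("now ts", 1234569990456ULL);
        return 0;
}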
@@ -4277,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4277}; 4438};
4278 4439
4279struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4280 mode_t mode, 4441 umode_t mode,
4281 struct dentry *parent, 4442 struct dentry *parent,
4282 void *data, 4443 void *data,
4283 const struct file_operations *fops) 4444 const struct file_operations *fops)
@@ -4450,6 +4611,9 @@ static __init int tracer_init_debugfs(void)
4450 trace_create_file("buffer_size_kb", 0644, d_tracer, 4611 trace_create_file("buffer_size_kb", 0644, d_tracer,
4451 &global_trace, &tracing_entries_fops); 4612 &global_trace, &tracing_entries_fops);
4452 4613
4614 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4615 &global_trace, &tracing_total_entries_fops);
4616
4453 trace_create_file("free_buffer", 0644, d_tracer, 4617 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops); 4618 &global_trace, &tracing_free_buffer_fops);
4455 4619
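The new buffer_total_size_kb file reports the ring-buffer size summed over all tracing CPUs and, until the buffer has actually been expanded, also shows what it will grow to, in the form "size (expanded: size)". A minimal reader; the path is an assumption, as above:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/sys/kernel/debug/tracing/buffer_total_size_kb", "r");

        if (!f) {
                perror("buffer_total_size_kb");
                return 1;
        }
        /* prints either "<kb>" or "<kb> (expanded: <kb>)" before expansion */
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}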
@@ -4566,6 +4730,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4566 4730
4567 tracing_off(); 4731 tracing_off();
4568 4732
4733 /* Did function tracer already get disabled? */
4734 if (ftrace_is_dead()) {
4735 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
4736 printk("# MAY BE MISSING FUNCTION EVENTS\n");
4737 }
4738
4569 if (disable_tracing) 4739 if (disable_tracing)
4570 ftrace_kill(); 4740 ftrace_kill();
4571 4741
@@ -4658,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4658{ 4828{
4659 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4660} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4661 4832
4662__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4663{ 4834{
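With ftrace_dump() now exported (GPL-only), modules can dump the trace buffer from their own failure paths instead of relying on the core oops handling. A hedged sketch of a hypothetical module that dumps the whole buffer from a panic notifier; the notifier wiring is standard kernel API, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* hypothetical example module: dump the ftrace buffer when the box panics */
static int dump_on_panic(struct notifier_block *nb, unsigned long ev, void *p)
{
        ftrace_dump(DUMP_ALL);          /* now callable from modules */
        return NOTIFY_DONE;
}

static struct notifier_block dump_nb = {
        .notifier_call = dump_on_panic,
};

static int __init dump_init(void)
{
        return atomic_notifier_chain_register(&panic_notifier_list, &dump_nb);
}

static void __exit dump_exit(void)
{
        atomic_notifier_chain_unregister(&panic_notifier_list, &dump_nb);
}

module_init(dump_init);
module_exit(dump_exit);
MODULE_LICENSE("GPL");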
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee5..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -579,11 +580,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
579 580
580 return test_tsk_trace_trace(task); 581 return test_tsk_trace_trace(task);
581} 582}
583extern int ftrace_is_dead(void);
582#else 584#else
583static inline int ftrace_trace_task(struct task_struct *task) 585static inline int ftrace_trace_task(struct task_struct *task)
584{ 586{
585 return 1; 587 return 1;
586} 588}
589static inline int ftrace_is_dead(void) { return 0; }
587#endif 590#endif
588 591
589/* 592/*
@@ -652,6 +655,7 @@ enum trace_iterator_flags {
652 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
653 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
655}; 659};
656 660
657/* 661/*
@@ -761,16 +765,10 @@ struct filter_pred {
761 filter_pred_fn_t fn; 765 filter_pred_fn_t fn;
762 u64 val; 766 u64 val;
763 struct regex regex; 767 struct regex regex;
764 /* 768 unsigned short *ops;
765 * Leaf nodes use field_name, ops is used by AND and OR 769#ifdef CONFIG_FTRACE_STARTUP_TEST
766 * nodes. The field_name is always freed when freeing a pred. 770 struct ftrace_event_field *field;
767 * We can overload field_name for ops and have it freed 771#endif
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
774 int offset; 772 int offset;
775 int not; 773 int not;
776 int op; 774 int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a1398..394783531cbb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
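trace_clock_counter() gives up wall-clock meaning entirely: each call returns the next value of a global atomic counter, which is what you want when the global order of events matters but their timing does not (selectable via the new "counter" entry in trace_clock). A userspace analogue using C11 atomics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_least64_t trace_counter;

/* same idea as trace_clock_counter(): every call returns a unique, strictly
 * increasing value, so events can be totally ordered across threads even
 * though the value says nothing about real time */
static uint64_t clock_counter(void)
{
        return atomic_fetch_add(&trace_counter, 1) + 1;
}

int main(void)
{
        printf("%llu %llu %llu\n",
               (unsigned long long)clock_counter(),
               (unsigned long long)clock_counter(),
               (unsigned long long)clock_counter());    /* 1 2 3 */
        return 0;
}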
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 581876f9f387..c212a7f934ec 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1078 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1079 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1080 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1082 system->nr_events++; 1081 system->nr_events++;
1083 return system->entry; 1082 return system->entry;
1084 } 1083 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd6..f04cc3136bd3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -381,6 +387,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
381 return pred; 387 return pred;
382} 388}
383 389
390enum walk_return {
391 WALK_PRED_ABORT,
392 WALK_PRED_PARENT,
393 WALK_PRED_DEFAULT,
394};
395
396typedef int (*filter_pred_walkcb_t) (enum move_type move,
397 struct filter_pred *pred,
398 int *err, void *data);
399
400static int walk_pred_tree(struct filter_pred *preds,
401 struct filter_pred *root,
402 filter_pred_walkcb_t cb, void *data)
403{
404 struct filter_pred *pred = root;
405 enum move_type move = MOVE_DOWN;
406 int done = 0;
407
408 if (!preds)
409 return -EINVAL;
410
411 do {
412 int err = 0, ret;
413
414 ret = cb(move, pred, &err, data);
415 if (ret == WALK_PRED_ABORT)
416 return err;
417 if (ret == WALK_PRED_PARENT)
418 goto get_parent;
419
420 switch (move) {
421 case MOVE_DOWN:
422 if (pred->left != FILTER_PRED_INVALID) {
423 pred = &preds[pred->left];
424 continue;
425 }
426 goto get_parent;
427 case MOVE_UP_FROM_LEFT:
428 pred = &preds[pred->right];
429 move = MOVE_DOWN;
430 continue;
431 case MOVE_UP_FROM_RIGHT:
432 get_parent:
433 if (pred == root)
434 break;
435 pred = get_pred_parent(pred, preds,
436 pred->parent,
437 &move);
438 continue;
439 }
440 done = 1;
441 } while (!done);
442
443 /* We are fine. */
444 return 0;
445}
446
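walk_pred_tree() collapses the several copies of the MOVE_DOWN / MOVE_UP_FROM_LEFT / MOVE_UP_FROM_RIGHT state machine that used to live in filter_match_preds(), check_pred_tree(), count_leafs(), fold_pred() and fold_pred_tree() into one iterative walker driven by a callback. The sketch below shows the same stackless pattern on a toy pointer-based tree (hypothetical names; the real code walks an array of preds by index and uses get_pred_parent()):

#include <stdio.h>

enum move_type { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };
enum walk_ret  { WALK_DEFAULT, WALK_PARENT, WALK_ABORT };

struct node {
        int value;
        struct node *left, *right, *parent;
};

typedef enum walk_ret (*walk_cb)(enum move_type move, struct node *n, void *data);

/* Iterative, stackless tree walk: the parent pointer plus the direction we
 * came from replaces recursion, just as pred->parent does in walk_pred_tree(). */
static void walk_tree(struct node *root, walk_cb cb, void *data)
{
        struct node *n = root;
        enum move_type move = MOVE_DOWN;

        for (;;) {
                enum walk_ret ret = cb(move, n, data);

                if (ret == WALK_ABORT)
                        return;
                if (ret != WALK_PARENT) {
                        if (move == MOVE_DOWN && n->left) {
                                n = n->left;
                                continue;
                        }
                        if (move == MOVE_UP_FROM_LEFT) {
                                n = n->right;
                                move = MOVE_DOWN;
                                continue;
                        }
                }
                /* climb back to the parent, remembering which child we were */
                if (n == root)
                        return;
                move = (n == n->parent->left) ? MOVE_UP_FROM_LEFT
                                              : MOVE_UP_FROM_RIGHT;
                n = n->parent;
        }
}

static enum walk_ret print_leaves(enum move_type move, struct node *n, void *data)
{
        (void)data;
        if (move == MOVE_DOWN && !n->left)
                printf("leaf %d\n", n->value);
        return WALK_DEFAULT;
}

int main(void)
{
        struct node leaves[3] = { { .value = 1 }, { .value = 2 }, { .value = 3 } };
        struct node op2 = { .left = &leaves[1], .right = &leaves[2] };
        struct node root = { .left = &leaves[0], .right = &op2 };

        leaves[0].parent = &root;
        op2.parent = &root;
        leaves[1].parent = leaves[2].parent = &op2;

        walk_tree(&root, print_leaves, NULL);   /* visits leaves 1, 2, 3 */
        return 0;
}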
384/* 447/*
385 * A series of AND or ORs where found together. Instead of 448 * A series of AND or ORs where found together. Instead of
386 * climbing up and down the tree branches, an array of the 449 * climbing up and down the tree branches, an array of the
@@ -410,99 +473,91 @@ static int process_ops(struct filter_pred *preds,
410 473
411 for (i = 0; i < op->val; i++) { 474 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]]; 475 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec); 476 if (!WARN_ON_ONCE(!pred->fn))
477 match = pred->fn(pred, rec);
414 if (!!match == type) 478 if (!!match == type)
415 return match; 479 return match;
416 } 480 }
417 return match; 481 return match;
418} 482}
419 483
484struct filter_match_preds_data {
485 struct filter_pred *preds;
486 int match;
487 void *rec;
488};
489
490static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
491 int *err, void *data)
492{
493 struct filter_match_preds_data *d = data;
494
495 *err = 0;
496 switch (move) {
497 case MOVE_DOWN:
498 /* only AND and OR have children */
499 if (pred->left != FILTER_PRED_INVALID) {
500 /* If ops is set, then it was folded. */
501 if (!pred->ops)
502 return WALK_PRED_DEFAULT;
503 /* We can treat folded ops as a leaf node */
504 d->match = process_ops(d->preds, pred, d->rec);
505 } else {
506 if (!WARN_ON_ONCE(!pred->fn))
507 d->match = pred->fn(pred, d->rec);
508 }
509
510 return WALK_PRED_PARENT;
511 case MOVE_UP_FROM_LEFT:
512 /*
513 * Check for short circuits.
514 *
515 * Optimization: !!match == (pred->op == OP_OR)
516 * is the same as:
517 * if ((match && pred->op == OP_OR) ||
518 * (!match && pred->op == OP_AND))
519 */
520 if (!!d->match == (pred->op == OP_OR))
521 return WALK_PRED_PARENT;
522 break;
523 case MOVE_UP_FROM_RIGHT:
524 break;
525 }
526
527 return WALK_PRED_DEFAULT;
528}
529
420/* return 1 if event matches, 0 otherwise (discard) */ 530/* return 1 if event matches, 0 otherwise (discard) */
421int filter_match_preds(struct event_filter *filter, void *rec) 531int filter_match_preds(struct event_filter *filter, void *rec)
422{ 532{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds; 533 struct filter_pred *preds;
426 struct filter_pred *pred;
427 struct filter_pred *root; 534 struct filter_pred *root;
428 int n_preds; 535 struct filter_match_preds_data data = {
429 int done = 0; 536 /* match is currently meaningless */
537 .match = -1,
538 .rec = rec,
539 };
540 int n_preds, ret;
430 541
431 /* no filter is considered a match */ 542 /* no filter is considered a match */
432 if (!filter) 543 if (!filter)
433 return 1; 544 return 1;
434 545
435 n_preds = filter->n_preds; 546 n_preds = filter->n_preds;
436
437 if (!n_preds) 547 if (!n_preds)
438 return 1; 548 return 1;
439 549
440 /* 550 /*
441 * n_preds, root and filter->preds are protect with preemption disabled. 551 * n_preds, root and filter->preds are protect with preemption disabled.
442 */ 552 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root); 553 root = rcu_dereference_sched(filter->root);
445 if (!root) 554 if (!root)
446 return 1; 555 return 1;
447 556
448 pred = root; 557 data.preds = preds = rcu_dereference_sched(filter->preds);
449 558 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
450 /* match is currently meaningless */ 559 WARN_ON(ret);
451 match = -1; 560 return data.match;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
506} 561}
507EXPORT_SYMBOL_GPL(filter_match_preds); 562EXPORT_SYMBOL_GPL(filter_match_preds);
508 563
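The short-circuit check carried over into filter_match_preds_cb(), !!match == (pred->op == OP_OR), folds the two classic early-exit cases (an OR that is already true, an AND that is already false) into one comparison. A tiny standalone check that the compact form agrees with the long form for every combination:

#include <assert.h>
#include <stdio.h>

enum { OP_OR, OP_AND };         /* only the two ops the optimization cares about */

int main(void)
{
        int ops[] = { OP_OR, OP_AND };

        for (int m = 0; m <= 1; m++) {
                for (int i = 0; i < 2; i++) {
                        int op = ops[i];
                        int shortcut = (!!m == (op == OP_OR));
                        int longhand = (m && op == OP_OR) || (!m && op == OP_AND);

                        assert(shortcut == longhand);
                }
        }
        printf("!!match == (op == OP_OR) matches the long form in all cases\n");
        return 0;
}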
@@ -597,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
597 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
598 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
599 else 654 else
600 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
601 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
602} 657}
603 658
@@ -628,22 +683,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
628 return __find_event_field(head, name); 683 return __find_event_field(head, name);
629} 684}
630 685
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{ 687{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +728,13 @@ __pop_pred_stack(struct pred_stack *stack)
689static int filter_set_pred(struct event_filter *filter, 728static int filter_set_pred(struct event_filter *filter,
690 int idx, 729 int idx,
691 struct pred_stack *stack, 730 struct pred_stack *stack,
692 struct filter_pred *src, 731 struct filter_pred *src)
693 filter_pred_fn_t fn)
694{ 732{
695 struct filter_pred *dest = &filter->preds[idx]; 733 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left; 734 struct filter_pred *left;
697 struct filter_pred *right; 735 struct filter_pred *right;
698 736
699 *dest = *src; 737 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
706 dest->index = idx; 738 dest->index = idx;
707 739
708 if (dest->op == OP_OR || dest->op == OP_AND) { 740 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +775,7 @@ static int filter_set_pred(struct event_filter *filter,
743 775
744static void __free_preds(struct event_filter *filter) 776static void __free_preds(struct event_filter *filter)
745{ 777{
746 int i;
747
748 if (filter->preds) { 778 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds); 779 kfree(filter->preds);
752 filter->preds = NULL; 780 filter->preds = NULL;
753 } 781 }
@@ -840,23 +868,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
840 } 868 }
841} 869}
842 870
843static int filter_add_pred_fn(struct filter_parse_state *ps, 871static int filter_add_pred(struct filter_parse_state *ps,
844 struct ftrace_event_call *call, 872 struct event_filter *filter,
845 struct event_filter *filter, 873 struct filter_pred *pred,
846 struct filter_pred *pred, 874 struct pred_stack *stack)
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
849{ 875{
850 int idx, err; 876 int err;
851 877
852 if (WARN_ON(filter->n_preds == filter->a_preds)) { 878 if (WARN_ON(filter->n_preds == filter->a_preds)) {
853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 879 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
854 return -ENOSPC; 880 return -ENOSPC;
855 } 881 }
856 882
857 idx = filter->n_preds; 883 err = filter_set_pred(filter, filter->n_preds, stack, pred);
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
860 if (err) 884 if (err)
861 return err; 885 return err;
862 886
@@ -937,31 +961,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
937 return fn; 961 return fn;
938} 962}
939 963
940static int filter_add_pred(struct filter_parse_state *ps, 964static int init_pred(struct filter_parse_state *ps,
941 struct ftrace_event_call *call, 965 struct ftrace_event_field *field,
942 struct event_filter *filter, 966 struct filter_pred *pred)
943 struct filter_pred *pred, 967
944 struct pred_stack *stack,
945 bool dry_run)
946{ 968{
947 struct ftrace_event_field *field; 969 filter_pred_fn_t fn = filter_pred_none;
948 filter_pred_fn_t fn;
949 unsigned long long val; 970 unsigned long long val;
950 int ret; 971 int ret;
951 972
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
965 pred->offset = field->offset; 973 pred->offset = field->offset;
966 974
967 if (!is_legal_op(field, pred->op)) { 975 if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1009,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
1001 if (pred->op == OP_NE) 1009 if (pred->op == OP_NE)
1002 pred->not = 1; 1010 pred->not = 1;
1003 1011
1004add_pred_fn: 1012 pred->fn = fn;
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1007 return 0; 1013 return 0;
1008} 1014}
1009 1015
@@ -1302,39 +1308,37 @@ parse_operand:
1302 return 0; 1308 return 0;
1303} 1309}
1304 1310
1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2) 1311static struct filter_pred *create_pred(struct filter_parse_state *ps,
1312 struct ftrace_event_call *call,
1313 int op, char *operand1, char *operand2)
1306{ 1314{
1307 struct filter_pred *pred; 1315 struct ftrace_event_field *field;
1316 static struct filter_pred pred;
1308 1317
1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 1318 memset(&pred, 0, sizeof(pred));
1310 if (!pred) 1319 pred.op = op;
1311 return NULL;
1312 1320
1313 pred->field_name = kstrdup(operand1, GFP_KERNEL); 1321 if (op == OP_AND || op == OP_OR)
1314 if (!pred->field_name) { 1322 return &pred;
1315 kfree(pred); 1323
1324 if (!operand1 || !operand2) {
1325 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1316 return NULL; 1326 return NULL;
1317 } 1327 }
1318 1328
1319 strcpy(pred->regex.pattern, operand2); 1329 field = find_event_field(call, operand1);
1320 pred->regex.len = strlen(pred->regex.pattern); 1330 if (!field) {
1321 1331 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1333 return NULL; 1332 return NULL;
1333 }
1334 1334
1335 pred->op = op; 1335 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern);
1336 1337
1337 return pred; 1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred;
1338} 1342}
1339 1343
1340static int check_preds(struct filter_parse_state *ps) 1344static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1379,23 @@ static int count_preds(struct filter_parse_state *ps)
1375 return n_preds; 1379 return n_preds;
1376} 1380}
1377 1381
1382struct check_pred_data {
1383 int count;
1384 int max;
1385};
1386
1387static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1388 int *err, void *data)
1389{
1390 struct check_pred_data *d = data;
1391
1392 if (WARN_ON(d->count++ > d->max)) {
1393 *err = -EINVAL;
1394 return WALK_PRED_ABORT;
1395 }
1396 return WALK_PRED_DEFAULT;
1397}
1398
1378/* 1399/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly 1400 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does 1401 * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1404,76 @@ static int count_preds(struct filter_parse_state *ps)
1383static int check_pred_tree(struct event_filter *filter, 1404static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root) 1405 struct filter_pred *root)
1385{ 1406{
1386 struct filter_pred *preds; 1407 struct check_pred_data data = {
1387 struct filter_pred *pred; 1408 /*
1388 enum move_type move = MOVE_DOWN; 1409 * The max that we can hit a node is three times.
1389 int count = 0; 1410 * Once going down, once coming up from left, and
1390 int done = 0; 1411 * once coming up from right. This is more than enough
1391 int max; 1412 * since leafs are only hit a single time.
1392 1413 */
1393 /* 1414 .max = 3 * filter->n_preds,
1394 * The max that we can hit a node is three times. 1415 .count = 0,
1395 * Once going down, once coming up from left, and 1416 };
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400 1417
1401 preds = filter->preds; 1418 return walk_pred_tree(filter->preds, root,
1402 if (!preds) 1419 check_pred_tree_cb, &data);
1403 return -EINVAL; 1420}
1404 pred = root;
1405 1421
1406 do { 1422static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1407 if (WARN_ON(count++ > max)) 1423 int *err, void *data)
1408 return -EINVAL; 1424{
1425 int *count = data;
1409 1426
1410 switch (move) { 1427 if ((move == MOVE_DOWN) &&
1411 case MOVE_DOWN: 1428 (pred->left == FILTER_PRED_INVALID))
1412 if (pred->left != FILTER_PRED_INVALID) { 1429 (*count)++;
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435 1430
1436 /* We are fine. */ 1431 return WALK_PRED_DEFAULT;
1437 return 0;
1438} 1432}
1439 1433
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1434static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{ 1435{
1442 struct filter_pred *pred; 1436 int count = 0, ret;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446 1437
1447 pred = root; 1438 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1439 WARN_ON(ret);
1440 return count;
1441}
1448 1442
1449 do { 1443struct fold_pred_data {
1450 switch (move) { 1444 struct filter_pred *root;
1451 case MOVE_DOWN: 1445 int count;
1452 if (pred->left != FILTER_PRED_INVALID) { 1446 int children;
1453 pred = &preds[pred->left]; 1447};
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476 1448
1477 return count; 1449static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1450 int *err, void *data)
1451{
1452 struct fold_pred_data *d = data;
1453 struct filter_pred *root = d->root;
1454
1455 if (move != MOVE_DOWN)
1456 return WALK_PRED_DEFAULT;
1457 if (pred->left != FILTER_PRED_INVALID)
1458 return WALK_PRED_DEFAULT;
1459
1460 if (WARN_ON(d->count == d->children)) {
1461 *err = -EINVAL;
1462 return WALK_PRED_ABORT;
1463 }
1464
1465 pred->index &= ~FILTER_PRED_FOLD;
1466 root->ops[d->count++] = pred->index;
1467 return WALK_PRED_DEFAULT;
1478} 1468}
1479 1469
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1470static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{ 1471{
1482 struct filter_pred *pred; 1472 struct fold_pred_data data = {
1483 enum move_type move = MOVE_DOWN; 1473 .root = root,
1484 int count = 0; 1474 .count = 0,
1475 };
1485 int children; 1476 int children;
1486 int done = 0;
1487 1477
1488 /* No need to keep the fold flag */ 1478 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD; 1479 root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1491,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1501 return -ENOMEM; 1491 return -ENOMEM;
1502 1492
1503 root->val = children; 1493 root->val = children;
1494 data.children = children;
1495 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1496}
1504 1497
1505 pred = root; 1498static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1506 do { 1499 int *err, void *data)
1507 switch (move) { 1500{
1508 case MOVE_DOWN: 1501 struct filter_pred *preds = data;
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533 1502
1534 return 0; 1503 if (move != MOVE_DOWN)
1504 return WALK_PRED_DEFAULT;
1505 if (!(pred->index & FILTER_PRED_FOLD))
1506 return WALK_PRED_DEFAULT;
1507
1508 *err = fold_pred(preds, pred);
1509 if (*err)
1510 return WALK_PRED_ABORT;
1511
 1512		/* everything below is folded, continue with parent */
1513 return WALK_PRED_PARENT;
1535} 1514}
1536 1515
1537/* 1516/*
@@ -1542,51 +1521,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1542static int fold_pred_tree(struct event_filter *filter, 1521static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root) 1522 struct filter_pred *root)
1544{ 1523{
1545 struct filter_pred *preds; 1524 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1546 struct filter_pred *pred; 1525 filter->preds);
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590} 1526}
1591 1527
1592static int replace_preds(struct ftrace_event_call *call, 1528static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1579,17 @@ static int replace_preds(struct ftrace_event_call *call,
1643 goto fail; 1579 goto fail;
1644 } 1580 }
1645 1581
1646 if (elt->op == OP_AND || elt->op == OP_OR) { 1582 pred = create_pred(ps, call, elt->op, operand1, operand2);
1647 pred = create_logical_pred(elt->op); 1583 if (!pred) {
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1653 err = -EINVAL; 1584 err = -EINVAL;
1654 goto fail; 1585 goto fail;
1655 } 1586 }
1656 1587
1657 pred = create_pred(elt->op, operand1, operand2); 1588 if (!dry_run) {
1658add_pred: 1589 err = filter_add_pred(ps, filter, pred, &stack);
1659 if (!pred) { 1590 if (err)
1660 err = -ENOMEM; 1591 goto fail;
1661 goto fail;
1662 } 1592 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1667 1593
1668 operand1 = operand2 = NULL; 1594 operand1 = operand2 = NULL;
1669 } 1595 }
@@ -1729,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system,
1729 */ 1655 */
1730 err = replace_preds(call, NULL, ps, filter_string, true); 1656 err = replace_preds(call, NULL, ps, filter_string, true);
1731 if (err) 1657 if (err)
1732 goto fail; 1658 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1659 else
1660 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1733 } 1661 }
1734 1662
1735 list_for_each_entry(call, &ftrace_events, list) { 1663 list_for_each_entry(call, &ftrace_events, list) {
@@ -1738,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system,
1738 if (strcmp(call->class->system, system->name) != 0) 1666 if (strcmp(call->class->system, system->name) != 0)
1739 continue; 1667 continue;
1740 1668
1669 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
1670 continue;
1671
1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1672 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1742 if (!filter_item) 1673 if (!filter_item)
1743 goto fail_mem; 1674 goto fail_mem;
@@ -1766,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system,
1766 * replace the filter for the call. 1697 * replace the filter for the call.
1767 */ 1698 */
1768 filter = call->filter; 1699 filter = call->filter;
1769 call->filter = filter_item->filter; 1700 rcu_assign_pointer(call->filter, filter_item->filter);
1770 filter_item->filter = filter; 1701 filter_item->filter = filter;
1771 1702
1772 fail = false; 1703 fail = false;
@@ -1821,7 +1752,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1821 filter = call->filter; 1752 filter = call->filter;
1822 if (!filter) 1753 if (!filter)
1823 goto out_unlock; 1754 goto out_unlock;
1824 call->filter = NULL; 1755 RCU_INIT_POINTER(call->filter, NULL);
1825 /* Make sure the filter is not being used */ 1756 /* Make sure the filter is not being used */
1826 synchronize_sched(); 1757 synchronize_sched();
1827 __free_filter(filter); 1758 __free_filter(filter);
@@ -1862,7 +1793,7 @@ out:
1862 * string 1793 * string
1863 */ 1794 */
1864 tmp = call->filter; 1795 tmp = call->filter;
1865 call->filter = filter; 1796 rcu_assign_pointer(call->filter, filter);
1866 if (tmp) { 1797 if (tmp) {
1867 /* Make sure the call is done with the filter */ 1798 /* Make sure the call is done with the filter */
1868 synchronize_sched(); 1799 synchronize_sched();
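These hunks publish and retire call->filter with rcu_assign_pointer()/RCU_INIT_POINTER() plus synchronize_sched(), so filter_match_preds(), which runs with preemption disabled, always sees either the complete old filter or the complete new one, never a half-initialized pointer. A hedged sketch of the general publish/wait/free pattern with generic names; it uses rcu_read_lock()/synchronize_rcu() where the filter code relies on preempt-disabled sections and synchronize_sched():

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        int threshold;
};

static struct cfg __rcu *active_cfg;

/* reader: enter an RCU section, dereference once, use, drop */
static int cfg_threshold(void)
{
        struct cfg *c;
        int ret = 0;

        rcu_read_lock();
        c = rcu_dereference(active_cfg);
        if (c)
                ret = c->threshold;
        rcu_read_unlock();
        return ret;
}

/* writer: publish the new object, wait out readers, then free the old one */
static int cfg_replace(int threshold)
{
        struct cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->threshold = threshold;

        old = rcu_dereference_protected(active_cfg, 1); /* caller serializes updates */
        rcu_assign_pointer(active_cfg, new);            /* publish with a barrier */
        if (old) {
                synchronize_rcu();                      /* wait for current readers */
                kfree(old);
        }
        return 0;
}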
@@ -1913,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1913 if (!filter) 1844 if (!filter)
1914 goto out; 1845 goto out;
1915 1846
1916 replace_filter_string(filter, filter_string); 1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1917 /* 1851 /*
1918 * No event actually uses the system filter 1852 * No event actually uses the system filter
1919 * we can free it without synchronize_sched(). 1853 * we can free it without synchronize_sched().
@@ -1923,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1923 1857
1924 parse_init(ps, filter_ops, filter_string); 1858 parse_init(ps, filter_ops, filter_string);
1925 err = filter_parse(ps); 1859 err = filter_parse(ps);
1926 if (err) { 1860 if (err)
1927 append_filter_err(ps, system->filter); 1861 goto err_filter;
1928 goto out;
1929 }
1930 1862
1931 err = replace_system_preds(system, ps, filter_string); 1863 err = replace_system_preds(system, ps, filter_string);
1932 if (err) 1864 if (err)
1933 append_filter_err(ps, system->filter); 1865 goto err_filter;
1934 1866
1935out: 1867out:
1936 filter_opstack_clear(ps); 1868 filter_opstack_clear(ps);
@@ -1940,6 +1872,11 @@ out_unlock:
1940 mutex_unlock(&event_mutex); 1872 mutex_unlock(&event_mutex);
1941 1873
1942 return err; 1874 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1943} 1880}
1944 1881
1945#ifdef CONFIG_PERF_EVENTS 1882#ifdef CONFIG_PERF_EVENTS
@@ -1958,17 +1895,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1958 int err; 1895 int err;
1959 struct event_filter *filter; 1896 struct event_filter *filter;
1960 struct filter_parse_state *ps; 1897 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL; 1898 struct ftrace_event_call *call;
1962 1899
1963 mutex_lock(&event_mutex); 1900 mutex_lock(&event_mutex);
1964 1901
1965 list_for_each_entry(call, &ftrace_events, list) { 1902 call = event->tp_event;
1966 if (call->event.type == event_id)
1967 break;
1968 }
1969 1903
1970 err = -EINVAL; 1904 err = -EINVAL;
1971 if (&call->list == &ftrace_events) 1905 if (!call)
1972 goto out_unlock; 1906 goto out_unlock;
1973 1907
1974 err = -EEXIST; 1908 err = -EEXIST;
@@ -2012,3 +1946,215 @@ out_unlock:
2012 1946
2013#endif /* CONFIG_PERF_EVENTS */ 1947#endif /* CONFIG_PERF_EVENTS */
2014 1948
1949#ifdef CONFIG_FTRACE_STARTUP_TEST
1950
1951#include <linux/types.h>
1952#include <linux/tracepoint.h>
1953
1954#define CREATE_TRACE_POINTS
1955#include "trace_events_filter_test.h"
1956
1957static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1958 struct event_filter **pfilter)
1959{
1960 struct event_filter *filter;
1961 struct filter_parse_state *ps;
1962 int err = -ENOMEM;
1963
1964 filter = __alloc_filter();
1965 if (!filter)
1966 goto out;
1967
1968 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1969 if (!ps)
1970 goto free_filter;
1971
1972 parse_init(ps, filter_ops, filter_str);
1973 err = filter_parse(ps);
1974 if (err)
1975 goto free_ps;
1976
1977 err = replace_preds(call, filter, ps, filter_str, false);
1978 if (!err)
1979 *pfilter = filter;
1980
1981 free_ps:
1982 filter_opstack_clear(ps);
1983 postfix_clear(ps);
1984 kfree(ps);
1985
1986 free_filter:
1987 if (err)
1988 __free_filter(filter);
1989
1990 out:
1991 return err;
1992}
1993
1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1995{ \
1996 .filter = FILTER, \
1997 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
1998 .e = ve, .f = vf, .g = vg, .h = vh }, \
1999 .match = m, \
2000 .not_visited = nvisit, \
2001}
2002#define YES 1
2003#define NO 0
2004
2005static struct test_filter_data_t {
2006 char *filter;
2007 struct ftrace_raw_ftrace_test_filter rec;
2008 int match;
2009 char *not_visited;
2010} test_filter_data[] = {
2011#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
2012 "e == 1 && f == 1 && g == 1 && h == 1"
2013 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
2014 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
2015 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
2016#undef FILTER
2017#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2018 "e == 1 || f == 1 || g == 1 || h == 1"
2019 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2020 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2021 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2022#undef FILTER
2023#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2024 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2025 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2026 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2027 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2028 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2029#undef FILTER
2030#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2031 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2032 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2033 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2034 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2035#undef FILTER
2036#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2037 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2038 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2039 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2040 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2041#undef FILTER
2042#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2043 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2044 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2045 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2046 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2047#undef FILTER
2048#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2049 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2050 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2051 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2052 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2053#undef FILTER
2054#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2055 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2056 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2057 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2058 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2059};
2060
2061#undef DATA_REC
2062#undef FILTER
2063#undef YES
2064#undef NO
2065
2066#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2067
2068static int test_pred_visited;
2069
2070static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2071{
2072 struct ftrace_event_field *field = pred->field;
2073
2074 test_pred_visited = 1;
2075 printk(KERN_INFO "\npred visited %s\n", field->name);
2076 return 1;
2077}
2078
2079static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2080 int *err, void *data)
2081{
2082 char *fields = data;
2083
2084 if ((move == MOVE_DOWN) &&
2085 (pred->left == FILTER_PRED_INVALID)) {
2086 struct ftrace_event_field *field = pred->field;
2087
2088 if (!field) {
 2089			WARN(1, "all leaves should have a field defined");
2090 return WALK_PRED_DEFAULT;
2091 }
2092 if (!strchr(fields, *field->name))
2093 return WALK_PRED_DEFAULT;
2094
2095 WARN_ON(!pred->fn);
2096 pred->fn = test_pred_visited_fn;
2097 }
2098 return WALK_PRED_DEFAULT;
2099}
2100
2101static __init int ftrace_test_event_filter(void)
2102{
2103 int i;
2104
2105 printk(KERN_INFO "Testing ftrace filter: ");
2106
2107 for (i = 0; i < DATA_CNT; i++) {
2108 struct event_filter *filter = NULL;
2109 struct test_filter_data_t *d = &test_filter_data[i];
2110 int err;
2111
2112 err = test_get_filter(d->filter, &event_ftrace_test_filter,
2113 &filter);
2114 if (err) {
2115 printk(KERN_INFO
2116 "Failed to get filter for '%s', err %d\n",
2117 d->filter, err);
2118 break;
2119 }
2120
2121 /*
2122 * The preemption disabling is not really needed for self
2123 * tests, but the rcu dereference will complain without it.
2124 */
2125 preempt_disable();
2126 if (*d->not_visited)
2127 walk_pred_tree(filter->preds, filter->root,
2128 test_walk_pred_cb,
2129 d->not_visited);
2130
2131 test_pred_visited = 0;
2132 err = filter_match_preds(filter, &d->rec);
2133 preempt_enable();
2134
2135 __free_filter(filter);
2136
2137 if (test_pred_visited) {
2138 printk(KERN_INFO
2139 "Failed, unwanted pred visited for filter %s\n",
2140 d->filter);
2141 break;
2142 }
2143
2144 if (err != d->match) {
2145 printk(KERN_INFO
2146 "Failed to match filter '%s', expected %d\n",
2147 d->filter, d->match);
2148 break;
2149 }
2150 }
2151
2152 if (i == DATA_CNT)
2153 printk(KERN_CONT "OK\n");
2154
2155 return 0;
2156}
2157
2158late_initcall(ftrace_test_event_filter);
2159
2160#endif /* CONFIG_FTRACE_STARTUP_TEST */
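The self-test added above is table-driven: each DATA_REC() entry pairs a sample record with the result filter_match_preds() is expected to return, plus a list of fields that must not be visited. For readers who want the shape of that pattern outside the kernel, here is a minimal userspace sketch of a table-driven filter test; the struct layout, the evaluate() stand-in and the sample expression are illustrative assumptions, not the kernel's API.

/*
 * Table-driven test sketch: each entry carries an input record and the
 * expected result of evaluating a fixed boolean filter against it.
 */
#include <stdio.h>

struct rec { int a, b, c, d; };

struct test_case {
	const char *filter;	/* human-readable description only */
	struct rec rec;		/* input record */
	int match;		/* expected result */
};

/* Stand-in for the real predicate tree: "a == 1 && (b == 1 || c == 1)" */
static int evaluate(const struct rec *r)
{
	return r->a == 1 && (r->b == 1 || r->c == 1);
}

static const struct test_case cases[] = {
	{ "a == 1 && (b == 1 || c == 1)", { 1, 1, 0, 0 }, 1 },
	{ "a == 1 && (b == 1 || c == 1)", { 1, 0, 0, 0 }, 0 },
	{ "a == 1 && (b == 1 || c == 1)", { 0, 1, 1, 0 }, 0 },
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		int got = evaluate(&cases[i].rec);

		if (got != cases[i].match) {
			printf("FAIL: '%s', expected %d got %d\n",
			       cases[i].filter, cases[i].match, got);
			return 1;
		}
	}
	printf("OK\n");
	return 0;
}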
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000000..bfd4dba0d603
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_SPINLOCK(max_trace_lock); 26static DEFINE_RAW_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
@@ -321,7 +332,7 @@ check_critical_timing(struct trace_array *tr,
321 if (!report_latency(delta)) 332 if (!report_latency(delta))
322 goto out; 333 goto out;
323 334
324 spin_lock_irqsave(&max_trace_lock, flags); 335 raw_spin_lock_irqsave(&max_trace_lock, flags);
325 336
326 /* check if we are still the max latency */ 337 /* check if we are still the max latency */
327 if (!report_latency(delta)) 338 if (!report_latency(delta))
@@ -344,7 +355,7 @@ check_critical_timing(struct trace_array *tr,
344 max_sequence++; 355 max_sequence++;
345 356
346out_unlock: 357out_unlock:
347 spin_unlock_irqrestore(&max_trace_lock, flags); 358 raw_spin_unlock_irqrestore(&max_trace_lock, flags);
348 359
349out: 360out:
350 data->critical_sequence = max_sequence; 361 data->critical_sequence = max_sequence;
@@ -505,13 +516,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
505#ifdef CONFIG_PREEMPT_TRACER 516#ifdef CONFIG_PREEMPT_TRACER
506void trace_preempt_on(unsigned long a0, unsigned long a1) 517void trace_preempt_on(unsigned long a0, unsigned long a1)
507{ 518{
508 if (preempt_trace()) 519 if (preempt_trace() && !irq_trace())
509 stop_critical_timing(a0, a1); 520 stop_critical_timing(a0, a1);
510} 521}
511 522
512void trace_preempt_off(unsigned long a0, unsigned long a1) 523void trace_preempt_off(unsigned long a0, unsigned long a1)
513{ 524{
514 if (preempt_trace()) 525 if (preempt_trace() && !irq_trace())
515 start_critical_timing(a0, a1); 526 start_critical_timing(a0, a1);
516} 527}
517#endif /* CONFIG_PREEMPT_TRACER */ 528#endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e5..00d527c945a4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
836} 836}
837 837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static void unregister_trace_probe(struct trace_probe *tp) 839static int unregister_trace_probe(struct trace_probe *tp)
840{ 840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
841 __unregister_trace_probe(tp); 845 __unregister_trace_probe(tp);
842 list_del(&tp->list); 846 list_del(&tp->list);
843 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
844} 850}
845 851
846/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
854 /* Delete old (same name) event if exist */ 860 /* Delete old (same name) event if exist */
855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
856 if (old_tp) { 862 if (old_tp) {
857 unregister_trace_probe(old_tp); 863 ret = unregister_trace_probe(old_tp);
864 if (ret < 0)
865 goto end;
858 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
859 } 867 }
860 868
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
892 mutex_lock(&probe_lock); 900 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) { 901 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) { 902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
895 __unregister_trace_probe(tp); 904 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp); 905 ret = __register_trace_probe(tp);
897 if (ret) 906 if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
1205 return -ENOENT; 1214 return -ENOENT;
1206 } 1215 }
1207 /* delete an event */ 1216 /* delete an event */
1208 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1209 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1210 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1211 return 0; 1221 return ret;
1212 } 1222 }
1213 1223
1214 if (argc < 2) { 1224 if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
1317 return ret; 1327 return ret;
1318} 1328}
1319 1329
1320static void release_all_trace_probes(void) 1330static int release_all_trace_probes(void)
1321{ 1331{
1322 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1323 1334
1324 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1325 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1326 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1327 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1328 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1329 free_trace_probe(tp); 1346 free_trace_probe(tp);
1330 } 1347 }
1348
1349end:
1331 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1351
1352 return ret;
1332} 1353}
1333 1354
1334/* Probes listing interfaces */ 1355/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1380 1401
1381static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1382{ 1403{
1383 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1384 (file->f_flags & O_TRUNC)) 1405
1385 release_all_trace_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1386 1411
1387 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1388} 1413}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
2055 2080
2056 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
2057 2082
 2083	/* Disable trace points before removing them */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
2058 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
2059 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
2060 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
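The trace_kprobe.c changes above make every unregister path fail with -EBUSY while a probe is still enabled, instead of silently tearing it down. A minimal sketch of that guard, using a toy probe structure rather than the kernel's probe_list/mutex machinery (all names here are illustrative):

#include <errno.h>
#include <stdio.h>

struct probe {
	const char *name;
	int enabled;		/* stands in for trace_probe_is_enabled() */
	int registered;
};

/* Refuse to tear down a probe that is still in use. */
static int unregister_probe(struct probe *p)
{
	if (p->enabled)
		return -EBUSY;
	p->registered = 0;
	return 0;
}

int main(void)
{
	struct probe p = { .name = "testprobe", .enabled = 1, .registered = 1 };

	if (unregister_probe(&p) == -EBUSY)
		printf("%s is busy, disable it first\n", p.name);

	p.enabled = 0;			/* like disable_trace_probe() */
	if (unregister_probe(&p) == 0)
		printf("%s unregistered\n", p.name);
	return 0;
}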
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
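The trace_print_context() change splits one trace_seq_printf() call into stages so each return value can be checked and the function bails out as soon as the sequence buffer is full. Below is a rough userspace analogue of that "stop on the first failed write" pattern, with a bounded buffer standing in for struct trace_seq; the names and the sample output are illustrative, not the kernel's API.

#include <stdarg.h>
#include <stdio.h>

struct seq_buf {
	char buf[64];
	size_t len;
};

/* Returns 1 on success, 0 once the buffer is full - like trace_seq_printf(). */
static int seq_printf(struct seq_buf *s, const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
		return 0;
	s->len += n;
	return 1;
}

static int print_context(struct seq_buf *s, int irq_info)
{
	if (!seq_printf(s, "%16s-%-5d [%03d] ", "bash", 1234, 0))
		return 0;
	if (irq_info && !seq_printf(s, "%s", "d..1"))
		return 0;
	return seq_printf(s, " %5lu.%06lu: ", 17UL, 42UL);
}

int main(void)
{
	struct seq_buf s = { .len = 0 };

	printf("ok=%d seq='%s'\n", print_context(&s, 1), s.buf);
	return 0;
}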
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d7..6fd4ffd042f9 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
59 continue; 59 continue;
60 } 60 }
61 61
62 fmt = NULL;
62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 63 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
63 if (tb_fmt) 64 if (tb_fmt) {
64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 65 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) { 66 if (fmt) {
66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 67 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
67 strcpy(fmt, *iter); 68 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt; 69 tb_fmt->fmt = fmt;
69 *iter = tb_fmt->fmt; 70 } else
70 } else { 71 kfree(tb_fmt);
71 kfree(tb_fmt);
72 *iter = NULL;
73 } 72 }
73 *iter = fmt;
74
74 } 75 }
75 mutex_unlock(&btrace_mutex); 76 mutex_unlock(&btrace_mutex);
76} 77}
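The trace_printk.c hunk reworks an allocation sequence so the list node is only linked (and the format only duplicated) when both allocations succeed, and *iter is assigned exactly once from a single variable. A standalone sketch of that pattern with generic names (not the kernel's trace_bprintk machinery):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fmt_node {
	struct fmt_node *next;
	char *fmt;
};

static struct fmt_node *fmt_list;

/*
 * Duplicate 'src' into a list node.  On any failure nothing is linked,
 * nothing leaks, and NULL is returned - mirroring the fixed flow above
 * where *iter is set from one 'fmt' variable at the end.
 */
static const char *hold_format(const char *src)
{
	char *fmt = NULL;
	struct fmt_node *node;

	node = malloc(sizeof(*node));
	if (node) {
		fmt = malloc(strlen(src) + 1);
		if (fmt) {
			strcpy(fmt, src);
			node->fmt = fmt;
			node->next = fmt_list;
			fmt_list = node;
		} else {
			free(node);	/* second allocation failed: undo the first */
		}
	}
	return fmt;
}

int main(void)
{
	const char *held = hold_format("hello %s\n");

	printf("held: %s", held ? held : "(allocation failed)\n");
	return 0;
}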
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0bb9f8..cb654542c1a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,6 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
5#include <linux/ftrace.h> 6#include <linux/ftrace.h>
6#include <linux/perf_event.h> 7#include <linux/perf_event.h>
7#include <asm/syscall.h> 8#include <asm/syscall.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c54..db110b8ae030 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the 37 * Tracepoints mutex protects the builtin and module tracepoints and the hash
38 * builtin and module tracepoints and the hash table. 38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
42/* 47/*
43 * Tracepoint hash table, containing the active tracepoints. 48 * Tracepoint hash table, containing the active tracepoints.
44 * Protected by tracepoints_mutex. 49 * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
292 * @end: end of the range 297 * @end: end of the range
293 * 298 *
294 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
295 */ 301 */
296void tracepoint_update_probe_range(struct tracepoint * const *begin, 302static void tracepoint_update_probe_range(struct tracepoint * const *begin,
297 struct tracepoint * const *end) 303 struct tracepoint * const *end)
298{ 304{
299 struct tracepoint * const *iter; 305 struct tracepoint * const *iter;
300 struct tracepoint_entry *mark_entry; 306 struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
302 if (!begin) 308 if (!begin)
303 return; 309 return;
304 310
305 mutex_lock(&tracepoints_mutex);
306 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
307 mark_entry = get_tracepoint((*iter)->name); 312 mark_entry = get_tracepoint((*iter)->name);
308 if (mark_entry) { 313 if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
312 disable_tracepoint(*iter); 317 disable_tracepoint(*iter);
313 } 318 }
314 } 319 }
315 mutex_unlock(&tracepoints_mutex);
316} 320}
317 321
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
318/* 338/*
319 * Update probes, removing the faulty probes. 339 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
320 */ 341 */
321static void tracepoint_update_probes(void) 342static void tracepoint_update_probes(void)
322{ 343{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
359 380
360 mutex_lock(&tracepoints_mutex); 381 mutex_lock(&tracepoints_mutex);
361 old = tracepoint_add_probe(name, probe, data); 382 old = tracepoint_add_probe(name, probe, data);
362 mutex_unlock(&tracepoints_mutex); 383 if (IS_ERR(old)) {
363 if (IS_ERR(old)) 384 mutex_unlock(&tracepoints_mutex);
364 return PTR_ERR(old); 385 return PTR_ERR(old);
365 386 }
366 tracepoint_update_probes(); /* may update entry */ 387 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
367 release_probes(old); 389 release_probes(old);
368 return 0; 390 return 0;
369} 391}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
402 424
403 mutex_lock(&tracepoints_mutex); 425 mutex_lock(&tracepoints_mutex);
404 old = tracepoint_remove_probe(name, probe, data); 426 old = tracepoint_remove_probe(name, probe, data);
405 mutex_unlock(&tracepoints_mutex); 427 if (IS_ERR(old)) {
406 if (IS_ERR(old)) 428 mutex_unlock(&tracepoints_mutex);
407 return PTR_ERR(old); 429 return PTR_ERR(old);
408 430 }
409 tracepoint_update_probes(); /* may update entry */ 431 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
410 release_probes(old); 433 release_probes(old);
411 return 0; 434 return 0;
412} 435}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
489 if (!list_empty(&old_probes)) 512 if (!list_empty(&old_probes))
490 list_replace_init(&old_probes, &release_probes); 513 list_replace_init(&old_probes, &release_probes);
491 need_update = 0; 514 need_update = 0;
492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes(); 515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex);
495 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 517 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
496 list_del(&pos->u.list); 518 list_del(&pos->u.list);
497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
509 * Will return the first tracepoint in the range if the input tracepoint is 531 * Will return the first tracepoint in the range if the input tracepoint is
510 * NULL. 532 * NULL.
511 */ 533 */
512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
513 struct tracepoint * const *begin, struct tracepoint * const *end) 535 struct tracepoint * const *begin, struct tracepoint * const *end)
514{ 536{
515 if (!*tracepoint && begin != end) { 537 if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
520 return 1; 542 return 1;
521 return 0; 543 return 0;
522} 544}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
524 545
546#ifdef CONFIG_MODULES
525static void tracepoint_get_iter(struct tracepoint_iter *iter) 547static void tracepoint_get_iter(struct tracepoint_iter *iter)
526{ 548{
527 int found = 0; 549 int found = 0;
550 struct tp_module *iter_mod;
528 551
529 /* Core kernel tracepoints */ 552 /* Core kernel tracepoints */
530 if (!iter->module) { 553 if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 if (found) 557 if (found)
535 goto end; 558 goto end;
536 } 559 }
537 /* tracepoints in modules. */ 560 /* Tracepoints in modules */
538 found = module_get_iter_tracepoints(iter); 561 mutex_lock(&tracepoints_mutex);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
539end: 580end:
540 if (!found) 581 if (!found)
541 tracepoint_iter_reset(iter); 582 tracepoint_iter_reset(iter);
542} 583}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
543 597
544void tracepoint_iter_start(struct tracepoint_iter *iter) 598void tracepoint_iter_start(struct tracepoint_iter *iter)
545{ 599{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
566 620
567void tracepoint_iter_reset(struct tracepoint_iter *iter) 621void tracepoint_iter_reset(struct tracepoint_iter *iter)
568{ 622{
623#ifdef CONFIG_MODULES
569 iter->module = NULL; 624 iter->module = NULL;
625#endif /* CONFIG_MODULES */
570 iter->tracepoint = NULL; 626 iter->tracepoint = NULL;
571} 627}
572EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 628EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
573 629
574#ifdef CONFIG_MODULES 630#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
 637	 * We skip modules that taint the kernel, especially those with a different
638 * module header (for forced load), to make sure we don't cause a crash.
639 */
640 if (mod->taints)
641 return 0;
642 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
644 if (!tp_mod) {
645 ret = -ENOMEM;
646 goto end;
647 }
648 tp_mod->num_tracepoints = mod->num_tracepoints;
649 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
650
651 /*
652 * tracepoint_module_list is kept sorted by struct module pointer
653 * address for iteration on tracepoints from a seq_file that can release
654 * the mutex between calls.
655 */
656 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
657 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
658 if (iter < tp_mod) {
659 /* We belong to the location right after iter. */
660 list_add(&tp_mod->list, &iter->list);
661 goto module_added;
662 }
663 }
664 /* We belong to the beginning of the list */
665 list_add(&tp_mod->list, &tracepoint_module_list);
666module_added:
667 tracepoint_update_probe_range(mod->tracepoints_ptrs,
668 mod->tracepoints_ptrs + mod->num_tracepoints);
669end:
670 mutex_unlock(&tracepoints_mutex);
671 return ret;
672}
673
674static int tracepoint_module_going(struct module *mod)
675{
676 struct tp_module *pos;
677
678 mutex_lock(&tracepoints_mutex);
679 tracepoint_update_probe_range(mod->tracepoints_ptrs,
680 mod->tracepoints_ptrs + mod->num_tracepoints);
681 list_for_each_entry(pos, &tracepoint_module_list, list) {
682 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
683 list_del(&pos->list);
684 kfree(pos);
685 break;
686 }
687 }
688 /*
689 * In the case of modules that were tainted at "coming", we'll simply
690 * walk through the list without finding it. We cannot use the "tainted"
691 * flag on "going", in case a module taints the kernel only after being
692 * loaded.
693 */
694 mutex_unlock(&tracepoints_mutex);
695 return 0;
696}
575 697
576int tracepoint_module_notify(struct notifier_block *self, 698int tracepoint_module_notify(struct notifier_block *self,
577 unsigned long val, void *data) 699 unsigned long val, void *data)
578{ 700{
579 struct module *mod = data; 701 struct module *mod = data;
702 int ret = 0;
580 703
581 switch (val) { 704 switch (val) {
582 case MODULE_STATE_COMING: 705 case MODULE_STATE_COMING:
706 ret = tracepoint_module_coming(mod);
707 break;
708 case MODULE_STATE_LIVE:
709 break;
583 case MODULE_STATE_GOING: 710 case MODULE_STATE_GOING:
584 tracepoint_update_probe_range(mod->tracepoints_ptrs, 711 ret = tracepoint_module_going(mod);
585 mod->tracepoints_ptrs + mod->num_tracepoints);
586 break; 712 break;
587 } 713 }
588 return 0; 714 return ret;
589} 715}
590 716
591struct notifier_block tracepoint_module_nb = { 717struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
598 return register_module_notifier(&tracepoint_module_nb); 724 return register_module_notifier(&tracepoint_module_nb);
599} 725}
600__initcall(init_tracepoints); 726__initcall(init_tracepoints);
601
602#endif /* CONFIG_MODULES */ 727#endif /* CONFIG_MODULES */
603 728
604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 729#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
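tracepoint_module_coming() above keeps tracepoint_module_list sorted by struct module pointer address so a seq_file iterator can drop the mutex between calls and still resume deterministically. Here is a small userspace sketch of inserting into a singly linked list kept sorted by node address; it uses a hand-rolled list rather than the kernel's list_head, and every name is illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct tp_mod {
	struct tp_mod *next;
	const char *name;
};

static struct tp_mod *mod_list;	/* kept sorted by ascending node address */

static void mod_list_insert(struct tp_mod *new)
{
	struct tp_mod **pos = &mod_list;

	/*
	 * Walk until the next node's address is larger than ours.  The
	 * kernel compares the struct module pointers directly; the casts
	 * just keep the comparison well defined in portable C.
	 */
	while (*pos && (uintptr_t)*pos < (uintptr_t)new)
		pos = &(*pos)->next;
	new->next = *pos;
	*pos = new;
}

int main(void)
{
	const char *names[] = { "mod_a", "mod_b", "mod_c" };
	struct tp_mod *m;
	int i;

	for (i = 0; i < 3; i++) {
		m = malloc(sizeof(*m));
		m->name = names[i];
		mod_list_insert(m);
	}

	for (m = mod_list; m; m = m->next)
		printf("%p %s\n", (void *)m, m->name);
	return 0;
}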
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/up.c b/kernel/up.c
index 1ff27a28bb7d..c54c75e9faf7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 92cb706c7fc8..1744bb80f1fb 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 9e03e9c1df8d..71dd2363ab0f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,7 +14,7 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/* 20/*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9da289c34f22..3b906e98b1db 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,7 +5,7 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b9510a..405caf91aad5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4d..63da38c2d820 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,10 +9,11 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
16 17
17static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
18{ 19{
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
51 uts_table.data = get_uts(table, write); 52 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 54 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
54 return r; 59 return r;
55} 60}
56#else 61#else
57#define proc_do_uts_string NULL 62#define proc_do_uts_string NULL
58#endif 63#endif
59 64
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
60static struct ctl_table uts_kern_table[] = { 68static struct ctl_table uts_kern_table[] = {
61 { 69 {
62 .procname = "ostype", 70 .procname = "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
85 .maxlen = sizeof(init_uts_ns.name.nodename), 93 .maxlen = sizeof(init_uts_ns.name.nodename),
86 .mode = 0644, 94 .mode = 0644,
87 .proc_handler = proc_do_uts_string, 95 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
88 }, 97 },
89 { 98 {
90 .procname = "domainname", 99 .procname = "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
92 .maxlen = sizeof(init_uts_ns.name.domainname), 101 .maxlen = sizeof(init_uts_ns.name.domainname),
93 .mode = 0644, 102 .mode = 0644,
94 .proc_handler = proc_do_uts_string, 103 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
95 }, 105 },
96 {} 106 {}
97}; 107};
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
105 {} 115 {}
106}; 116};
107 117
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
108static int __init utsname_sysctl_init(void) 131static int __init utsname_sysctl_init(void)
109{ 132{
110 register_sysctl_table(uts_root_table); 133 register_sysctl_table(uts_root_table);
diff --git a/kernel/wait.c b/kernel/wait.c
index f45ea8d2a1ce..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -4,16 +4,16 @@
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d4..1d7bca7f4f52 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param); 327 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 __set_current_state(TASK_RUNNING); 352 __set_current_state(TASK_RUNNING);
353 353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param);
354 return 0; 355 return 0;
355} 356}
356 357
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
438 439
439 /* create the watchdog thread */ 440 /* create the watchdog thread */
440 if (!p) { 441 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
442 if (IS_ERR(p)) { 443 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) { 445 if (!err) {
@@ -480,6 +481,8 @@ static void watchdog_disable(int cpu)
480 } 481 }
481} 482}
482 483
484/* sysctl functions */
485#ifdef CONFIG_SYSCTL
483static void watchdog_enable_all_cpus(void) 486static void watchdog_enable_all_cpus(void)
484{ 487{
485 int cpu; 488 int cpu;
@@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void)
509} 512}
510 513
511 514
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
514/* 515/*
515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 516 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
516 */ 517 */
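The watchdog() change above drops the kthread back to SCHED_NORMAL before it returns, rather than exiting while still at the highest RT priority. A hedged userspace sketch of the same raise-then-restore idea using sched_setscheduler(); getting SCHED_FIFO needs CAP_SYS_NICE, so the calls are checked and the program merely reports failure otherwise. This illustrates the pattern only, not the kernel thread code.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void do_watchdog_work(void)
{
	/* placeholder for the real periodic work */
}

int main(void)
{
	struct sched_param param = {
		.sched_priority = sched_get_priority_max(SCHED_FIFO)
	};

	/* Raise to a real-time policy for the duration of the work. */
	if (sched_setscheduler(0, SCHED_FIFO, &param))
		fprintf(stderr, "SCHED_FIFO: %s (need CAP_SYS_NICE)\n",
			strerror(errno));

	do_watchdog_work();

	/* Restore the default policy before exiting, as the patch does. */
	param.sched_priority = 0;
	if (sched_setscheduler(0, SCHED_OTHER, &param))
		fprintf(stderr, "SCHED_OTHER: %s\n", strerror(errno));

	return 0;
}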
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1783aabc6128..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/module.h> 26#include <linux/export.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -242,10 +242,10 @@ struct workqueue_struct {
242 242
243 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
244 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
246#ifdef CONFIG_LOCKDEP 245#ifdef CONFIG_LOCKDEP
247 struct lockdep_map lockdep_map; 246 struct lockdep_map lockdep_map;
248#endif 247#endif
248 char name[]; /* I: workqueue name */
249}; 249};
250 250
251struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
2954 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
2955} 2955}
2956 2956
2957struct workqueue_struct *__alloc_workqueue_key(const char *name, 2957struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2958 unsigned int flags, 2958 unsigned int flags,
2959 int max_active, 2959 int max_active,
2960 struct lock_class_key *key, 2960 struct lock_class_key *key,
2961 const char *lock_name) 2961 const char *lock_name, ...)
2962{ 2962{
2963 va_list args, args1;
2963 struct workqueue_struct *wq; 2964 struct workqueue_struct *wq;
2964 unsigned int cpu; 2965 unsigned int cpu;
2966 size_t namelen;
2967
2968 /* determine namelen, allocate wq and format name */
2969 va_start(args, lock_name);
2970 va_copy(args1, args);
2971 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2972
2973 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2974 if (!wq)
2975 goto err;
2976
2977 vsnprintf(wq->name, namelen, fmt, args1);
2978 va_end(args);
2979 va_end(args1);
2965 2980
2966 /* 2981 /*
2967 * Workqueues which may be used during memory reclaim should 2982 * Workqueues which may be used during memory reclaim should
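The __alloc_workqueue_key() rework in the hunk above sizes the allocation by running vsnprintf() once with a NULL buffer, then formats the name directly into a flexible array member at the end of the struct. A compact userspace sketch of that measure-allocate-format pattern follows; the struct and function names are made up for illustration.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct named_obj {
	int flags;
	char name[];			/* flexible array member, like wq->name */
};

static struct named_obj *alloc_named(int flags, const char *fmt, ...)
{
	va_list args, args1;
	struct named_obj *obj;
	size_t namelen;

	va_start(args, fmt);
	va_copy(args1, args);

	/* First pass only measures: vsnprintf(NULL, 0, ...) returns the length. */
	namelen = vsnprintf(NULL, 0, fmt, args) + 1;

	obj = malloc(sizeof(*obj) + namelen);
	if (obj) {
		obj->flags = flags;
		vsnprintf(obj->name, namelen, fmt, args1);	/* second pass formats */
	}

	va_end(args);
	va_end(args1);
	return obj;
}

int main(void)
{
	struct named_obj *obj = alloc_named(0, "events/%d", 3);

	if (obj) {
		printf("%s\n", obj->name);
		free(obj);
	}
	return 0;
}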
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2978 flags |= WQ_HIGHPRI; 2993 flags |= WQ_HIGHPRI;
2979 2994
2980 max_active = max_active ?: WQ_DFL_ACTIVE; 2995 max_active = max_active ?: WQ_DFL_ACTIVE;
2981 max_active = wq_clamp_max_active(max_active, flags, name); 2996 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
2986 2997
2998 /* init wq */
2987 wq->flags = flags; 2999 wq->flags = flags;
2988 wq->saved_max_active = max_active; 3000 wq->saved_max_active = max_active;
2989 mutex_init(&wq->flush_mutex); 3001 mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2991 INIT_LIST_HEAD(&wq->flusher_queue); 3003 INIT_LIST_HEAD(&wq->flusher_queue);
2992 INIT_LIST_HEAD(&wq->flusher_overflow); 3004 INIT_LIST_HEAD(&wq->flusher_overflow);
2993 3005
2994 wq->name = name;
2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3006 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2996 INIT_LIST_HEAD(&wq->list); 3007 INIT_LIST_HEAD(&wq->list);
2997 3008
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
3020 if (!rescuer) 3031 if (!rescuer)
3021 goto err; 3032 goto err;
3022 3033
3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); 3034 rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3035 wq->name);
3024 if (IS_ERR(rescuer->task)) 3036 if (IS_ERR(rescuer->task))
3025 goto err; 3037 goto err;
3026 3038