aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile20
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c191
-rw-r--r--kernel/events/core.c298
-rw-r--r--kernel/events/internal.h39
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/irq/irqdomain.c12
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c49
-rw-r--r--kernel/lockdep.c83
-rw-r--r--kernel/panic.c17
-rw-r--r--kernel/posix-cpu-timers.c132
-rw-r--r--kernel/printk.c11
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcutiny.c149
-rw-r--r--kernel/rcutiny_plugin.h29
-rw-r--r--kernel/rcutorture.c225
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h26
-rw-r--r--kernel/rcutree_plugin.h289
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)0
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2187
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)4
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)0
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1000
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)30
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)218
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)109
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/time/tick-sched.c105
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/timer.c62
-rw-r--r--kernel/trace/trace.c106
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_filter.c26
-rw-r--r--kernel/trace/trace_irqsoff.c13
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/wait.c4
58 files changed, 4206 insertions, 3050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 103obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 104
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 111
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
123 113
124# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..203dfead2e06 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 616 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 617 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..5ca38d5d238a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
380 cpu_maps_update_done(); 379 cpu_maps_update_done();
381 return err; 380 return err;
382} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
383 383
384#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118 if (err)
119 release_callchain_buffers();
120exit:
121 mutex_unlock(&callchain_mutex);
122
123 return err;
124}
125
126void put_callchain_buffers(void)
127{
128 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
129 release_callchain_buffers();
130 mutex_unlock(&callchain_mutex);
131 }
132}
133
134static struct perf_callchain_entry *get_callchain_entry(int *rctx)
135{
136 int cpu;
137 struct callchain_cpus_entries *entries;
138
139 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
140 if (*rctx == -1)
141 return NULL;
142
143 entries = rcu_dereference(callchain_cpus_entries);
144 if (!entries)
145 return NULL;
146
147 cpu = smp_processor_id();
148
149 return &entries->cpu_entries[cpu][*rctx];
150}
151
152static void
153put_callchain_entry(int rctx)
154{
155 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
156}
157
158struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
159{
160 int rctx;
161 struct perf_callchain_entry *entry;
162
163
164 entry = get_callchain_entry(&rctx);
165 if (rctx == -1)
166 return NULL;
167
168 if (!entry)
169 goto exit_put;
170
171 entry->nr = 0;
172
173 if (!user_mode(regs)) {
174 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
175 perf_callchain_kernel(entry, regs);
176 if (current->mm)
177 regs = task_pt_regs(current);
178 else
179 regs = NULL;
180 }
181
182 if (regs) {
183 perf_callchain_store(entry, PERF_CONTEXT_USER);
184 perf_callchain_user(entry, regs);
185 }
186
187exit_put:
188 put_callchain_entry(rctx);
189
190 return entry;
191}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..890eb02c2f21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -128,7 +128,7 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 130 */
131struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
133 133
134static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1130 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1131 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1132 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1133 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1134 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1135} 1137}
@@ -1325,6 +1327,7 @@ retry:
1325 } 1327 }
1326 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1327} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1328 1331
1329static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1330 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1406 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1407 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1408 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1409 1414
1410 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1411 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
1662 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1663 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1664 */ 1669 */
1665static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1666 struct perf_event_context *ctx)
1667{ 1671{
1668 struct perf_event *sub; 1672 struct perf_event *sub;
1669 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
1701 */ 1705 */
1702 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1703 1707
1704 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1705 1709
1706 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1707 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1782 1786
1783retry: 1787retry:
1784 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1785 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1786 goto out; 1790 goto out;
1787 } 1791 }
1788 1792
@@ -1809,6 +1813,7 @@ retry:
1809out: 1813out:
1810 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1811} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1812 1817
1813int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1814{ 1819{
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2327 u64 interrupts, now; 2332 u64 interrupts, now;
2328 s64 delta; 2333 s64 delta;
2329 2334
2335 if (!ctx->nr_freq)
2336 return;
2337
2330 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2331 if (event->state != PERF_EVENT_STATE_ACTIVE) 2339 if (event->state != PERF_EVENT_STATE_ACTIVE)
2332 continue; 2340 continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2382{ 2390{
2383 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; 2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2384 struct perf_event_context *ctx = NULL; 2392 struct perf_event_context *ctx = NULL;
2385 int rotate = 0, remove = 1; 2393 int rotate = 0, remove = 1, freq = 0;
2386 2394
2387 if (cpuctx->ctx.nr_events) { 2395 if (cpuctx->ctx.nr_events) {
2388 remove = 0; 2396 remove = 0;
2389 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2390 rotate = 1; 2398 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2391 } 2401 }
2392 2402
2393 ctx = cpuctx->task_ctx; 2403 ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2395 remove = 0; 2405 remove = 0;
2396 if (ctx->nr_events != ctx->nr_active) 2406 if (ctx->nr_events != ctx->nr_active)
2397 rotate = 1; 2407 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2398 } 2410 }
2399 2411
2412 if (!rotate && !freq)
2413 goto done;
2414
2400 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2401 perf_pmu_disable(cpuctx->ctx.pmu); 2416 perf_pmu_disable(cpuctx->ctx.pmu);
2402 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2403 if (ctx)
2404 perf_ctx_adjust_freq(ctx, interval);
2405 2417
2406 if (!rotate) 2418 if (freq) {
2407 goto done; 2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2420 if (ctx)
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2408 2423
2409 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2424 if (rotate) {
2410 if (ctx) 2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2411 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2412 2428
2413 rotate_ctx(&cpuctx->ctx); 2429 rotate_ctx(&cpuctx->ctx);
2414 if (ctx) 2430 if (ctx)
2415 rotate_ctx(ctx); 2431 rotate_ctx(ctx);
2416 2432
2417 perf_event_sched_in(cpuctx, ctx, current); 2433 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2435
2436 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2418 2438
2419done: 2439done:
2420 if (remove) 2440 if (remove)
2421 list_del_init(&cpuctx->rotation_list); 2441 list_del_init(&cpuctx->rotation_list);
2422
2423 perf_pmu_enable(cpuctx->ctx.pmu);
2424 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2425} 2442}
2426 2443
2427void perf_event_task_tick(void) 2444void perf_event_task_tick(void)
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
2448 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2465 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2449 return 0; 2466 return 0;
2450 2467
2451 __perf_event_mark_enabled(event, ctx); 2468 __perf_event_mark_enabled(event);
2452 2469
2453 return 1; 2470 return 1;
2454} 2471}
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2480 raw_spin_lock(&ctx->lock); 2497 raw_spin_lock(&ctx->lock);
2481 task_ctx_sched_out(ctx); 2498 task_ctx_sched_out(ctx);
2482 2499
2483 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2500 list_for_each_entry(event, &ctx->event_list, event_entry) {
2484 ret = event_enable_on_exec(event, ctx);
2485 if (ret)
2486 enabled = 1;
2487 }
2488
2489 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2490 ret = event_enable_on_exec(event, ctx); 2501 ret = event_enable_on_exec(event, ctx);
2491 if (ret) 2502 if (ret)
2492 enabled = 1; 2503 enabled = 1;
@@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
2574} 2585}
2575 2586
2576/* 2587/*
2577 * Callchain support
2578 */
2579
2580struct callchain_cpus_entries {
2581 struct rcu_head rcu_head;
2582 struct perf_callchain_entry *cpu_entries[0];
2583};
2584
2585static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2586static atomic_t nr_callchain_events;
2587static DEFINE_MUTEX(callchain_mutex);
2588struct callchain_cpus_entries *callchain_cpus_entries;
2589
2590
2591__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2597 struct pt_regs *regs)
2598{
2599}
2600
2601static void release_callchain_buffers_rcu(struct rcu_head *head)
2602{
2603 struct callchain_cpus_entries *entries;
2604 int cpu;
2605
2606 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2607
2608 for_each_possible_cpu(cpu)
2609 kfree(entries->cpu_entries[cpu]);
2610
2611 kfree(entries);
2612}
2613
2614static void release_callchain_buffers(void)
2615{
2616 struct callchain_cpus_entries *entries;
2617
2618 entries = callchain_cpus_entries;
2619 rcu_assign_pointer(callchain_cpus_entries, NULL);
2620 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2621}
2622
2623static int alloc_callchain_buffers(void)
2624{
2625 int cpu;
2626 int size;
2627 struct callchain_cpus_entries *entries;
2628
2629 /*
2630 * We can't use the percpu allocation API for data that can be
2631 * accessed from NMI. Use a temporary manual per cpu allocation
2632 * until that gets sorted out.
2633 */
2634 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2635
2636 entries = kzalloc(size, GFP_KERNEL);
2637 if (!entries)
2638 return -ENOMEM;
2639
2640 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2641
2642 for_each_possible_cpu(cpu) {
2643 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2644 cpu_to_node(cpu));
2645 if (!entries->cpu_entries[cpu])
2646 goto fail;
2647 }
2648
2649 rcu_assign_pointer(callchain_cpus_entries, entries);
2650
2651 return 0;
2652
2653fail:
2654 for_each_possible_cpu(cpu)
2655 kfree(entries->cpu_entries[cpu]);
2656 kfree(entries);
2657
2658 return -ENOMEM;
2659}
2660
2661static int get_callchain_buffers(void)
2662{
2663 int err = 0;
2664 int count;
2665
2666 mutex_lock(&callchain_mutex);
2667
2668 count = atomic_inc_return(&nr_callchain_events);
2669 if (WARN_ON_ONCE(count < 1)) {
2670 err = -EINVAL;
2671 goto exit;
2672 }
2673
2674 if (count > 1) {
2675 /* If the allocation failed, give up */
2676 if (!callchain_cpus_entries)
2677 err = -ENOMEM;
2678 goto exit;
2679 }
2680
2681 err = alloc_callchain_buffers();
2682 if (err)
2683 release_callchain_buffers();
2684exit:
2685 mutex_unlock(&callchain_mutex);
2686
2687 return err;
2688}
2689
2690static void put_callchain_buffers(void)
2691{
2692 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2693 release_callchain_buffers();
2694 mutex_unlock(&callchain_mutex);
2695 }
2696}
2697
2698static int get_recursion_context(int *recursion)
2699{
2700 int rctx;
2701
2702 if (in_nmi())
2703 rctx = 3;
2704 else if (in_irq())
2705 rctx = 2;
2706 else if (in_softirq())
2707 rctx = 1;
2708 else
2709 rctx = 0;
2710
2711 if (recursion[rctx])
2712 return -1;
2713
2714 recursion[rctx]++;
2715 barrier();
2716
2717 return rctx;
2718}
2719
2720static inline void put_recursion_context(int *recursion, int rctx)
2721{
2722 barrier();
2723 recursion[rctx]--;
2724}
2725
2726static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2727{
2728 int cpu;
2729 struct callchain_cpus_entries *entries;
2730
2731 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2732 if (*rctx == -1)
2733 return NULL;
2734
2735 entries = rcu_dereference(callchain_cpus_entries);
2736 if (!entries)
2737 return NULL;
2738
2739 cpu = smp_processor_id();
2740
2741 return &entries->cpu_entries[cpu][*rctx];
2742}
2743
2744static void
2745put_callchain_entry(int rctx)
2746{
2747 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2748}
2749
2750static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2751{
2752 int rctx;
2753 struct perf_callchain_entry *entry;
2754
2755
2756 entry = get_callchain_entry(&rctx);
2757 if (rctx == -1)
2758 return NULL;
2759
2760 if (!entry)
2761 goto exit_put;
2762
2763 entry->nr = 0;
2764
2765 if (!user_mode(regs)) {
2766 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2767 perf_callchain_kernel(entry, regs);
2768 if (current->mm)
2769 regs = task_pt_regs(current);
2770 else
2771 regs = NULL;
2772 }
2773
2774 if (regs) {
2775 perf_callchain_store(entry, PERF_CONTEXT_USER);
2776 perf_callchain_user(entry, regs);
2777 }
2778
2779exit_put:
2780 put_callchain_entry(rctx);
2781
2782 return entry;
2783}
2784
2785/*
2786 * Initialize the perf_event context in a task_struct: 2588 * Initialize the perf_event context in a task_struct:
2787 */ 2589 */
2788static void __perf_event_init_context(struct perf_event_context *ctx) 2590static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
2946 2748
2947 if (!event->parent) { 2749 if (!event->parent) {
2948 if (event->attach_state & PERF_ATTACH_TASK) 2750 if (event->attach_state & PERF_ATTACH_TASK)
2949 jump_label_dec(&perf_sched_events); 2751 jump_label_dec_deferred(&perf_sched_events);
2950 if (event->attr.mmap || event->attr.mmap_data) 2752 if (event->attr.mmap || event->attr.mmap_data)
2951 atomic_dec(&nr_mmap_events); 2753 atomic_dec(&nr_mmap_events);
2952 if (event->attr.comm) 2754 if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
2957 put_callchain_buffers(); 2759 put_callchain_buffers();
2958 if (is_cgroup_event(event)) { 2760 if (is_cgroup_event(event)) {
2959 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2960 jump_label_dec(&perf_sched_events); 2762 jump_label_dec_deferred(&perf_sched_events);
2961 } 2763 }
2962 } 2764 }
2963 2765
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4820 struct hw_perf_event *hwc = &event->hw; 4622 struct hw_perf_event *hwc = &event->hw;
4821 int throttle = 0; 4623 int throttle = 0;
4822 4624
4823 data->period = event->hw.last_period;
4824 if (!overflow) 4625 if (!overflow)
4825 overflow = perf_swevent_set_period(event); 4626 overflow = perf_swevent_set_period(event);
4826 4627
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4854 if (!is_sampling_event(event)) 4655 if (!is_sampling_event(event))
4855 return; 4656 return;
4856 4657
4658 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4659 data->period = nr;
4660 return perf_swevent_overflow(event, 1, data, regs);
4661 } else
4662 data->period = event->hw.last_period;
4663
4857 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4664 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4858 return perf_swevent_overflow(event, 1, data, regs); 4665 return perf_swevent_overflow(event, 1, data, regs);
4859 4666
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5366 regs = get_irq_regs(); 5173 regs = get_irq_regs();
5367 5174
5368 if (regs && !perf_exclude_event(event, regs)) { 5175 if (regs && !perf_exclude_event(event, regs)) {
5369 if (!(event->attr.exclude_idle && current->pid == 0)) 5176 if (!(event->attr.exclude_idle && is_idle_task(current)))
5370 if (perf_event_overflow(event, &data, regs)) 5177 if (perf_event_overflow(event, &data, regs))
5371 ret = HRTIMER_NORESTART; 5178 ret = HRTIMER_NORESTART;
5372 } 5179 }
@@ -5981,7 +5788,7 @@ done:
5981 5788
5982 if (!event->parent) { 5789 if (!event->parent) {
5983 if (event->attach_state & PERF_ATTACH_TASK) 5790 if (event->attach_state & PERF_ATTACH_TASK)
5984 jump_label_inc(&perf_sched_events); 5791 jump_label_inc(&perf_sched_events.key);
5985 if (event->attr.mmap || event->attr.mmap_data) 5792 if (event->attr.mmap || event->attr.mmap_data)
5986 atomic_inc(&nr_mmap_events); 5793 atomic_inc(&nr_mmap_events);
5987 if (event->attr.comm) 5794 if (event->attr.comm)
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
6219 * - that may need work on context switch 6026 * - that may need work on context switch
6220 */ 6027 */
6221 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6222 jump_label_inc(&perf_sched_events); 6029 jump_label_inc(&perf_sched_events.key);
6223 } 6030 }
6224 6031
6225 /* 6032 /*
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void)
7065 6872
7066 ret = init_hw_breakpoint(); 6873 ret = init_hw_breakpoint();
7067 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6874 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6875
6876 /* do not patch jump label more than once per second */
6877 jump_label_rate_limit(&perf_sched_events, HZ);
7068} 6878}
7069 6879
7070static int __init perf_event_sysfs_init(void) 6880static int __init perf_event_sysfs_init(void)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 64568a699375..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
67} 71}
68#endif 72#endif
69 73
70static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
71{ 75{
72 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
73} 77}
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
96 } while (len); 100 } while (len);
97} 101}
98 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
99#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index e6e01b959a0e..d579a459309d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
123 */ 123 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 124 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 125 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 126 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1255 spin_lock_irq(&p->real_parent->sighand->siglock); 1255 spin_lock_irq(&p->real_parent->sighand->siglock);
1256 psig = p->real_parent->signal; 1256 psig = p->real_parent->signal;
1257 sig = p->signal; 1257 sig = p->signal;
1258 psig->cutime = 1258 psig->cutime += tgutime + sig->cutime;
1259 cputime_add(psig->cutime, 1259 psig->cstime += tgstime + sig->cstime;
1260 cputime_add(tgutime, 1260 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1261 sig->cutime));
1262 psig->cstime =
1263 cputime_add(psig->cstime,
1264 cputime_add(tgstime,
1265 sig->cstime));
1266 psig->cgtime =
1267 cputime_add(psig->cgtime,
1268 cputime_add(p->gtime,
1269 cputime_add(sig->gtime,
1270 sig->cgtime)));
1271 psig->cmin_flt += 1261 psig->cmin_flt +=
1272 p->min_flt + sig->min_flt + sig->cmin_flt; 1262 p->min_flt + sig->min_flt + sig->cmin_flt;
1273 psig->cmaj_flt += 1263 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..b058c5820ecd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1023 */ 1023 */
1024static void posix_cpu_timers_init(struct task_struct *tsk) 1024static void posix_cpu_timers_init(struct task_struct *tsk)
1025{ 1025{
1026 tsk->cputime_expires.prof_exp = cputime_zero; 1026 tsk->cputime_expires.prof_exp = 0;
1027 tsk->cputime_expires.virt_exp = cputime_zero; 1027 tsk->cputime_expires.virt_exp = 0;
1028 tsk->cputime_expires.sched_exp = 0; 1028 tsk->cputime_expires.sched_exp = 0;
1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 1132
1133 init_sigpending(&p->pending); 1133 init_sigpending(&p->pending);
1134 1134
1135 p->utime = cputime_zero; 1135 p->utime = p->stime = p->gtime = 0;
1136 p->stime = cputime_zero; 1136 p->utimescaled = p->stimescaled = 0;
1137 p->gtime = cputime_zero;
1138 p->utimescaled = cputime_zero;
1139 p->stimescaled = cputime_zero;
1140#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1137#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1141 p->prev_utime = cputime_zero; 1138 p->prev_utime = p->prev_stime = 0;
1142 p->prev_stime = cputime_zero;
1143#endif 1139#endif
1144#if defined(SPLIT_RSS_COUNTING) 1140#if defined(SPLIT_RSS_COUNTING)
1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1141 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..7ca523b249ef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 143 return 0;
144} 144}
145 145
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 146/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 147 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 148 */
@@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 177}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 178EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 179#endif /* CONFIG_OF_IRQ */
180
181struct irq_domain_ops irq_domain_simple_ops = {
182#ifdef CONFIG_OF_IRQ
183 .dt_translate = irq_domain_simple_dt_translate,
184#endif /* CONFIG_OF_IRQ */
185};
186EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..30c3c7708132 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key)
72 jump_label_unlock(); 72 jump_label_unlock();
73} 73}
74 74
75void jump_label_dec(struct jump_label_key *key) 75static void __jump_label_dec(struct jump_label_key *key,
76 unsigned long rate_limit, struct delayed_work *work)
76{ 77{
77 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
78 return; 79 return;
79 80
80 jump_label_update(key, JUMP_LABEL_DISABLE); 81 if (rate_limit) {
82 atomic_inc(&key->enabled);
83 schedule_delayed_work(work, rate_limit);
84 } else
85 jump_label_update(key, JUMP_LABEL_DISABLE);
86
81 jump_label_unlock(); 87 jump_label_unlock();
82} 88}
83 89
90static void jump_label_update_timeout(struct work_struct *work)
91{
92 struct jump_label_key_deferred *key =
93 container_of(work, struct jump_label_key_deferred, work.work);
94 __jump_label_dec(&key->key, 0, NULL);
95}
96
97void jump_label_dec(struct jump_label_key *key)
98{
99 __jump_label_dec(key, 0, NULL);
100}
101
102void jump_label_dec_deferred(struct jump_label_key_deferred *key)
103{
104 __jump_label_dec(&key->key, key->timeout, &key->work);
105}
106
107
108void jump_label_rate_limit(struct jump_label_key_deferred *key,
109 unsigned long rl)
110{
111 key->timeout = rl;
112 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
113}
114
84static int addr_conflict(struct jump_entry *entry, void *start, void *end) 115static int addr_conflict(struct jump_entry *entry, void *start, void *end)
85{ 116{
86 if (entry->code <= (unsigned long)end && 117 if (entry->code <= (unsigned long)end &&
@@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
111 * running code can override this to make the non-live update case 142 * running code can override this to make the non-live update case
112 * cheaper. 143 * cheaper.
113 */ 144 */
114void __weak arch_jump_label_transform_static(struct jump_entry *entry, 145void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
115 enum jump_label_type type) 146 enum jump_label_type type)
116{ 147{
117 arch_jump_label_transform(entry, type); 148 arch_jump_label_transform(entry, type);
@@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod)
217 if (iter_start == iter_stop) 248 if (iter_start == iter_stop)
218 return; 249 return;
219 250
220 for (iter = iter_start; iter < iter_stop; iter++) 251 for (iter = iter_start; iter < iter_stop; iter++) {
221 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 252 struct jump_label_key *iterk;
253
254 iterk = (struct jump_label_key *)(unsigned long)iter->key;
255 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
256 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
257 }
222} 258}
223 259
224static int jump_label_add_module(struct module *mod) 260static int jump_label_add_module(struct module *mod)
@@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod)
258 key->next = jlm; 294 key->next = jlm;
259 295
260 if (jump_label_enabled(key)) 296 if (jump_label_enabled(key))
261 __jump_label_update(key, iter, iter_stop, 297 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
262 JUMP_LABEL_ENABLE);
263 } 298 }
264 299
265 return 0; 300 return 0;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b2e08c932d91..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth;
431 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
432 */ 432 */
433static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
434static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
435static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
436 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
499 usage[i] = '\0'; 500 usage[i] = '\0';
500} 501}
501 502
502static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
503{ 504{
504 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
505 const char *name; 506 const char *name;
506 507
507 name = class->name; 508 name = class->name;
508 if (!name)
509 name = __get_key_name(class->key, str);
510
511 return printk("%s", name);
512}
513
514static void print_lock_name(struct lock_class *class)
515{
516 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
517 const char *name;
518
519 get_usage_chars(class, usage);
520
521 name = class->name;
522 if (!name) { 509 if (!name) {
523 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
524 printk(" (%s", name); 511 printk("%s", name);
525 } else { 512 } else {
526 printk(" (%s", name); 513 printk("%s", name);
527 if (class->name_version > 1) 514 if (class->name_version > 1)
528 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
529 if (class->subclass) 516 if (class->subclass)
530 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
531 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
532 printk("){%s}", usage); 529 printk("){%s}", usage);
533} 530}
534 531
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
568 } 565 }
569} 566}
570 567
571static void print_kernel_version(void) 568static void print_kernel_ident(void)
572{ 569{
573 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
574 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
575 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
576} 574}
577 575
578static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
657 lockdep_init(); 655 lockdep_init();
658 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
659 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
660 } 659 }
661#endif 660#endif
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
723 722
724 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
725 if (likely(class)) 724 if (likely(class))
726 return class; 725 goto out_set_class_cache;
727 726
728 /* 727 /*
729 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -808,6 +807,7 @@ out_unlock_set:
808 graph_unlock(); 807 graph_unlock();
809 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
810 809
810out_set_class_cache:
811 if (!subclass || force) 811 if (!subclass || force)
812 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1149 printk("\n"); 1149 printk("\n");
1150 printk("======================================================\n"); 1150 printk("======================================================\n");
1151 printk("[ INFO: possible circular locking dependency detected ]\n"); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1152 print_kernel_version(); 1152 print_kernel_ident();
1153 printk("-------------------------------------------------------\n"); 1153 printk("-------------------------------------------------------\n");
1154 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1155 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1488 printk("======================================================\n"); 1488 printk("======================================================\n");
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1490 irqclass, irqclass); 1490 irqclass, irqclass);
1491 print_kernel_version(); 1491 print_kernel_ident();
1492 printk("------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1494 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1717 printk("\n"); 1717 printk("\n");
1718 printk("=============================================\n"); 1718 printk("=============================================\n");
1719 printk("[ INFO: possible recursive locking detected ]\n"); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1720 print_kernel_version(); 1720 print_kernel_ident();
1721 printk("---------------------------------------------\n"); 1721 printk("---------------------------------------------\n");
1722 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1723 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2224 printk("\n"); 2224 printk("\n");
2225 printk("=================================\n"); 2225 printk("=================================\n");
2226 printk("[ INFO: inconsistent lock state ]\n"); 2226 printk("[ INFO: inconsistent lock state ]\n");
2227 print_kernel_version(); 2227 print_kernel_ident();
2228 printk("---------------------------------\n"); 2228 printk("---------------------------------\n");
2229 2229
2230 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2289 printk("\n"); 2289 printk("\n");
2290 printk("=========================================================\n"); 2290 printk("=========================================================\n");
2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2292 print_kernel_version(); 2292 print_kernel_ident();
2293 printk("---------------------------------------------------------\n"); 2293 printk("---------------------------------------------------------\n");
2294 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2295 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3175 printk("\n"); 3175 printk("\n");
3176 printk("=====================================\n"); 3176 printk("=====================================\n");
3177 printk("[ BUG: bad unlock balance detected! ]\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3178 printk("-------------------------------------\n"); 3179 printk("-------------------------------------\n");
3179 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3180 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3619 printk("\n"); 3620 printk("\n");
3620 printk("=================================\n"); 3621 printk("=================================\n");
3621 printk("[ BUG: bad contention detected! ]\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3622 printk("---------------------------------\n"); 3624 printk("---------------------------------\n");
3623 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3624 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void)
3974 3976
3975#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3976 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3977 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3978 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3979 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3980 } 3983 }
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3993 printk("\n"); 3996 printk("\n");
3994 printk("=========================\n"); 3997 printk("=========================\n");
3995 printk("[ BUG: held lock freed! ]\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
3996 printk("-------------------------\n"); 4000 printk("-------------------------\n");
3997 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3998 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
4050 printk("\n"); 4054 printk("\n");
4051 printk("=====================================\n"); 4055 printk("=====================================\n");
4052 printk("[ BUG: lock held at task exit time! ]\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4053 printk("-------------------------------------\n"); 4058 printk("-------------------------------------\n");
4054 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
4055 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void)
4147 printk("\n"); 4152 printk("\n");
4148 printk("================================================\n"); 4153 printk("================================================\n");
4149 printk("[ BUG: lock held when returning to user space! ]\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4150 printk("------------------------------------------------\n"); 4156 printk("------------------------------------------------\n");
4151 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
4152 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4166 printk("\n"); 4172 printk("\n");
4167 printk("===============================\n"); 4173 printk("===============================\n");
4168 printk("[ INFO: suspicious RCU usage. ]\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4175 print_kernel_ident();
4169 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4170 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4171 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4172 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section
4183 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
4184 * considers that CPU to be in an "extended quiescent state",
4185 * which means that RCU will be completely ignoring that CPU.
4186 * Therefore, rcu_read_lock() and friends have absolutely no
4187 * effect on a CPU running in that state. In other words, even if
4188 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4189 * delete data structures out from under it. RCU really has no
4190 * choice here: we need to keep an RCU-free window in idle where
4191 * the CPU may possibly enter into low power mode. This way we can
4192 * notice an extended quiescent state to other CPUs that started a grace
4193 * period. Otherwise we would delay any grace period as long as we run
4194 * in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4173 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4174 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4175 dump_stack(); 4204 dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index b26593604214..3458469eb7c3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -237,11 +237,20 @@ void add_taint(unsigned flag)
237 * Can't trust the integrity of the kernel anymore. 237 * Can't trust the integrity of the kernel anymore.
238 * We don't call directly debug_locks_off() because the issue 238 * We don't call directly debug_locks_off() because the issue
239 * is not necessarily serious enough to set oops_in_progress to 1 239 * is not necessarily serious enough to set oops_in_progress to 1
240 * Also we want to keep up lockdep for staging development and 240 * Also we want to keep up lockdep for staging/out-of-tree
241 * post-warning case. 241 * development and post-warning case.
242 */ 242 */
243 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 243 switch (flag) {
244 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 244 case TAINT_CRAP:
245 case TAINT_OOT_MODULE:
246 case TAINT_WARN:
247 case TAINT_FIRMWARE_WORKAROUND:
248 break;
249
250 default:
251 if (__debug_locks_off())
252 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
253 }
245 254
246 set_bit(flag, &tainted_mask); 255 set_bit(flag, &tainted_mask);
247} 256}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 7982a0a841ea..989e4a52da76 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -688,6 +688,7 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 raw_spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
401 521
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * or NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three proprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The hander doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273e..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
101 101
102 printk("\n============================================\n"); 102 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
104 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
105 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
106 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index a7f381a78469..2a4590fabcad 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this tasks belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972 287
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{ 687{
1674 return 0; 688 return 0;
1675} 689}
1676#endif 690#endif
1677 691
1678#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work good under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2783 1511
2784} 1512}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2786#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2787 1520
2788static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2790 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2791 1524
2792#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2794 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2795 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2796 return; 1529 return;
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3204 local_irq_enable(); 1937 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3206 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3207 1941
3208 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3209 if (mm) 1943 if (mm)
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2173 */
3440static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3441 2175
3442static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3443{ 2177{
3444 long delta; 2178 long delta;
3445 2179
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2317 */
3584} 2318}
3585#else 2319#else
3586static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3587{ 2321{
3588} 2322}
3589 2323
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3728 */ 2462 */
3729static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3730{ 2464{
3731 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2538,10 @@ unlock:
3804#endif 2538#endif
3805 2539
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2542
3808EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2545
3810/* 2546/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2594 return ns;
3859} 2595}
3860 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since all updates are sure to touch the root cgroup, we
2611 * get ourselves ahead and touch it first. If the root cgroup
2612 * is the only cgroup, then nothing else should be necessary.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
2631
2632
3861/* 2633/*
3862 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3869{ 2641{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3871 cputime64_t tmp;
3872 2643
3873 /* Add user time to process. */ 2644 /* Add user time to process. */
3874 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3876 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3877 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3878 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884 2653
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886 /* Account for user time used */ 2654 /* Account for user time used */
3887 acct_update_integrals(p); 2655 acct_update_integrals(p);
3888} 2656}
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3896static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3898{ 2666{
3899 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903 2668
3904 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3905 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3907 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3909 2674
3910 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3911 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3914 } else { 2679 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3917 } 2682 }
3918} 2683}
3919 2684
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3926 */ 2691 */
3927static inline 2692static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3930{ 2695{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */ 2696 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3936 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3937 2700
3938 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941 2703
3942 /* Account for system time used */ 2704 /* Account for system time used */
3943 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3953void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3955{ 2717{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3957 cputime64_t *target_cputime64;
3958 2719
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3962 } 2723 }
3963 2724
3964 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3966 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3968 else 2729 else
3969 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3970 2731
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3972} 2733}
3973 2734
3974/* 2735/*
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3977 */ 2738 */
3978void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3979{ 2740{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982 2742
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3984} 2744}
3985 2745
3986/* 2746/*
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3989 */ 2749 */
3990void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3991{ 2751{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3995 2754
3996 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3998 else 2757 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
4000} 2759}
4001 2760
4002static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq) 2805 struct rq *rq)
4047{ 2806{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051 2809
4052 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
4053 return; 2811 return;
4054 2812
4055 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4057 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4059 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
4060 /* 2818 /*
4061 * ksoftirqd time do not get accounted in cpu_softirq_time. 2819 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4063 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
4064 */ 2822 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
4067 } else if (user_tick) { 2825 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else { 2831 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system); 2833 CPUTIME_SYSTEM);
4076 } 2834 }
4077} 2835}
4078 2836
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4171 2929
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{ 2931{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4175 2933
4176 /* 2934 /*
4177 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180 2938
4181 if (total) { 2939 if (total) {
4182 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4183 2941
4184 temp *= utime; 2942 temp *= (__force u64) utime;
4185 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4186 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4187 } else 2945 } else
4188 utime = rtime; 2946 utime = rtime;
4189 2947
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4191 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4192 */ 2950 */
4193 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4195 2953
4196 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4197 *st = p->prev_stime; 2955 *st = p->prev_stime;
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4208 2966
4209 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4210 2968
4211 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213 2971
4214 if (total) { 2972 if (total) {
4215 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4216 2974
4217 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4218 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4219 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4220 } else 2978 } else
4221 utime = rtime; 2979 utime = rtime;
4222 2980
4223 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4225 cputime_sub(rtime, sig->prev_utime));
4226 2983
4227 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4228 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4321{ 3078{
4322 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4323 3080
3081 if (oops_in_progress)
3082 return;
3083
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4326 3086
@@ -5852,6 +4612,13 @@ again:
5852 */ 4612 */
5853 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5855 } 4622 }
5856 4623
5857out: 4624out:
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
6019 free = stack_not_used(p); 4786 free = stack_not_used(p);
6020#endif 4787#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6023 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
6024 4791
6025 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6118#endif 4885#endif
6119} 4886}
6120 4887
6121/*
6122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
6167
6168#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4889void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{ 4890{
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq)
6351 rq->calc_load_active = 0; 5071 rq->calc_load_active = 0;
6352} 5072}
6353 5073
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378/* 5074/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq(). 5076 * try_to_wake_up()->select_task_rq().
@@ -6980,6 +5676,12 @@ out:
6980 return -ENOMEM; 5676 return -ENOMEM;
6981} 5677}
6982 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6983static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6984{ 5686{
6985 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7051} 5753}
7052 5754
7053/* 5755/*
5756 * Keep a special pointer to the highest sched_domain that has
5757 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
5758 * allows us to avoid some pointer chasing select_idle_sibling().
5759 *
5760 * Also keep a unique ID per domain (we use the first cpu number in
5761 * the cpumask of the domain), this allows us to quickly tell if
5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
5779
5780/*
7054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7055 * hold the hotplug lock. 5782 * hold the hotplug lock.
7056 */ 5783 */
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7089 tmp = rq->sd; 5816 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
7092} 5821}
7093 5822
7094/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7248 continue; 5977 continue;
7249 5978
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7252 5981
7253 if (!sg) 5982 if (!sg)
7254 goto fail; 5983 goto fail;
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7386 return; 6115 return;
7387 6116
7388 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7389} 6124}
7390 6125
7391/* 6126/*
@@ -8021,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8021 } 6756 }
8022} 6757}
8023 6758
8024static int update_runtime(struct notifier_block *nfb,
8025 unsigned long action, void *hcpu)
8026{
8027 int cpu = (int)(long)hcpu;
8028
8029 switch (action) {
8030 case CPU_DOWN_PREPARE:
8031 case CPU_DOWN_PREPARE_FROZEN:
8032 disable_runtime(cpu_rq(cpu));
8033 return NOTIFY_OK;
8034
8035 case CPU_DOWN_FAILED:
8036 case CPU_DOWN_FAILED_FROZEN:
8037 case CPU_ONLINE:
8038 case CPU_ONLINE_FROZEN:
8039 enable_runtime(cpu_rq(cpu));
8040 return NOTIFY_OK;
8041
8042 default:
8043 return NOTIFY_DONE;
8044 }
8045}
8046
8047void __init sched_init_smp(void) 6759void __init sched_init_smp(void)
8048{ 6760{
8049 cpumask_var_t non_isolated_cpus; 6761 cpumask_var_t non_isolated_cpus;
@@ -8092,104 +6804,11 @@ int in_sched_functions(unsigned long addr)
8092 && addr < (unsigned long)__sched_text_end); 6804 && addr < (unsigned long)__sched_text_end);
8093} 6805}
8094 6806
8095static void init_cfs_rq(struct cfs_rq *cfs_rq) 6807#ifdef CONFIG_CGROUP_SCHED
8096{ 6808struct task_group root_task_group;
8097 cfs_rq->tasks_timeline = RB_ROOT;
8098 INIT_LIST_HEAD(&cfs_rq->tasks);
8099 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8100#ifndef CONFIG_64BIT
8101 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8102#endif
8103}
8104
8105static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8106{
8107 struct rt_prio_array *array;
8108 int i;
8109
8110 array = &rt_rq->active;
8111 for (i = 0; i < MAX_RT_PRIO; i++) {
8112 INIT_LIST_HEAD(array->queue + i);
8113 __clear_bit(i, array->bitmap);
8114 }
8115 /* delimiter for bitsearch: */
8116 __set_bit(MAX_RT_PRIO, array->bitmap);
8117
8118#if defined CONFIG_SMP
8119 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8120 rt_rq->highest_prio.next = MAX_RT_PRIO;
8121 rt_rq->rt_nr_migratory = 0;
8122 rt_rq->overloaded = 0;
8123 plist_head_init(&rt_rq->pushable_tasks);
8124#endif
8125
8126 rt_rq->rt_time = 0;
8127 rt_rq->rt_throttled = 0;
8128 rt_rq->rt_runtime = 0;
8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8130}
8131
8132#ifdef CONFIG_FAIR_GROUP_SCHED
8133static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8134 struct sched_entity *se, int cpu,
8135 struct sched_entity *parent)
8136{
8137 struct rq *rq = cpu_rq(cpu);
8138
8139 cfs_rq->tg = tg;
8140 cfs_rq->rq = rq;
8141#ifdef CONFIG_SMP
8142 /* allow initial update_cfs_load() to truncate */
8143 cfs_rq->load_stamp = 1;
8144#endif
8145 init_cfs_rq_runtime(cfs_rq);
8146
8147 tg->cfs_rq[cpu] = cfs_rq;
8148 tg->se[cpu] = se;
8149
8150 /* se could be NULL for root_task_group */
8151 if (!se)
8152 return;
8153
8154 if (!parent)
8155 se->cfs_rq = &rq->cfs;
8156 else
8157 se->cfs_rq = parent->my_q;
8158
8159 se->my_q = cfs_rq;
8160 update_load_set(&se->load, 0);
8161 se->parent = parent;
8162}
8163#endif 6809#endif
8164 6810
8165#ifdef CONFIG_RT_GROUP_SCHED 6811DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8166static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8167 struct sched_rt_entity *rt_se, int cpu,
8168 struct sched_rt_entity *parent)
8169{
8170 struct rq *rq = cpu_rq(cpu);
8171
8172 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8173 rt_rq->rt_nr_boosted = 0;
8174 rt_rq->rq = rq;
8175 rt_rq->tg = tg;
8176
8177 tg->rt_rq[cpu] = rt_rq;
8178 tg->rt_se[cpu] = rt_se;
8179
8180 if (!rt_se)
8181 return;
8182
8183 if (!parent)
8184 rt_se->rt_rq = &rq->rt;
8185 else
8186 rt_se->rt_rq = parent->my_q;
8187
8188 rt_se->my_q = rt_rq;
8189 rt_se->parent = parent;
8190 INIT_LIST_HEAD(&rt_se->run_list);
8191}
8192#endif
8193 6812
8194void __init sched_init(void) 6813void __init sched_init(void)
8195{ 6814{
@@ -8247,9 +6866,17 @@ void __init sched_init(void)
8247#ifdef CONFIG_CGROUP_SCHED 6866#ifdef CONFIG_CGROUP_SCHED
8248 list_add(&root_task_group.list, &task_groups); 6867 list_add(&root_task_group.list, &task_groups);
8249 INIT_LIST_HEAD(&root_task_group.children); 6868 INIT_LIST_HEAD(&root_task_group.children);
6869 INIT_LIST_HEAD(&root_task_group.siblings);
8250 autogroup_init(&init_task); 6870 autogroup_init(&init_task);
6871
8251#endif /* CONFIG_CGROUP_SCHED */ 6872#endif /* CONFIG_CGROUP_SCHED */
8252 6873
6874#ifdef CONFIG_CGROUP_CPUACCT
6875 root_cpuacct.cpustat = &kernel_cpustat;
6876 root_cpuacct.cpuusage = alloc_percpu(u64);
6877 /* Too early, not expected to fail */
6878 BUG_ON(!root_cpuacct.cpuusage);
6879#endif
8253 for_each_possible_cpu(i) { 6880 for_each_possible_cpu(i) {
8254 struct rq *rq; 6881 struct rq *rq;
8255 6882
@@ -8261,7 +6888,7 @@ void __init sched_init(void)
8261 init_cfs_rq(&rq->cfs); 6888 init_cfs_rq(&rq->cfs);
8262 init_rt_rq(&rq->rt, rq); 6889 init_rt_rq(&rq->rt, rq);
8263#ifdef CONFIG_FAIR_GROUP_SCHED 6890#ifdef CONFIG_FAIR_GROUP_SCHED
8264 root_task_group.shares = root_task_group_load; 6891 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8265 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6892 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8266 /* 6893 /*
8267 * How much cpu bandwidth does root_task_group get? 6894 * How much cpu bandwidth does root_task_group get?
@@ -8311,7 +6938,7 @@ void __init sched_init(void)
8311 rq->avg_idle = 2*sysctl_sched_migration_cost; 6938 rq->avg_idle = 2*sysctl_sched_migration_cost;
8312 rq_attach_root(rq, &def_root_domain); 6939 rq_attach_root(rq, &def_root_domain);
8313#ifdef CONFIG_NO_HZ 6940#ifdef CONFIG_NO_HZ
8314 rq->nohz_balance_kick = 0; 6941 rq->nohz_flags = 0;
8315#endif 6942#endif
8316#endif 6943#endif
8317 init_rq_hrtick(rq); 6944 init_rq_hrtick(rq);
@@ -8324,10 +6951,6 @@ void __init sched_init(void)
8324 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6951 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8325#endif 6952#endif
8326 6953
8327#ifdef CONFIG_SMP
8328 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8329#endif
8330
8331#ifdef CONFIG_RT_MUTEXES 6954#ifdef CONFIG_RT_MUTEXES
8332 plist_head_init(&init_task.pi_waiters); 6955 plist_head_init(&init_task.pi_waiters);
8333#endif 6956#endif
@@ -8355,17 +6978,11 @@ void __init sched_init(void)
8355 6978
8356#ifdef CONFIG_SMP 6979#ifdef CONFIG_SMP
8357 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6980 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8358#ifdef CONFIG_NO_HZ
8359 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8360 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8361 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8362 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8363 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8364#endif
8365 /* May be allocated at isolcpus cmdline parse time */ 6981 /* May be allocated at isolcpus cmdline parse time */
8366 if (cpu_isolated_map == NULL) 6982 if (cpu_isolated_map == NULL)
8367 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6983 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8368#endif /* SMP */ 6984#endif
6985 init_sched_fair_class();
8369 6986
8370 scheduler_running = 1; 6987 scheduler_running = 1;
8371} 6988}
@@ -8517,169 +7134,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8517 7134
8518#endif 7135#endif
8519 7136
8520#ifdef CONFIG_FAIR_GROUP_SCHED
8521static void free_fair_sched_group(struct task_group *tg)
8522{
8523 int i;
8524
8525 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8526
8527 for_each_possible_cpu(i) {
8528 if (tg->cfs_rq)
8529 kfree(tg->cfs_rq[i]);
8530 if (tg->se)
8531 kfree(tg->se[i]);
8532 }
8533
8534 kfree(tg->cfs_rq);
8535 kfree(tg->se);
8536}
8537
8538static
8539int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8540{
8541 struct cfs_rq *cfs_rq;
8542 struct sched_entity *se;
8543 int i;
8544
8545 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8546 if (!tg->cfs_rq)
8547 goto err;
8548 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8549 if (!tg->se)
8550 goto err;
8551
8552 tg->shares = NICE_0_LOAD;
8553
8554 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8555
8556 for_each_possible_cpu(i) {
8557 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8558 GFP_KERNEL, cpu_to_node(i));
8559 if (!cfs_rq)
8560 goto err;
8561
8562 se = kzalloc_node(sizeof(struct sched_entity),
8563 GFP_KERNEL, cpu_to_node(i));
8564 if (!se)
8565 goto err_free_rq;
8566
8567 init_cfs_rq(cfs_rq);
8568 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8569 }
8570
8571 return 1;
8572
8573err_free_rq:
8574 kfree(cfs_rq);
8575err:
8576 return 0;
8577}
8578
8579static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8580{
8581 struct rq *rq = cpu_rq(cpu);
8582 unsigned long flags;
8583
8584 /*
8585 * Only empty task groups can be destroyed; so we can speculatively
8586 * check on_list without danger of it being re-added.
8587 */
8588 if (!tg->cfs_rq[cpu]->on_list)
8589 return;
8590
8591 raw_spin_lock_irqsave(&rq->lock, flags);
8592 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8593 raw_spin_unlock_irqrestore(&rq->lock, flags);
8594}
8595#else /* !CONFIG_FAIR_GROUP_SCHED */
8596static inline void free_fair_sched_group(struct task_group *tg)
8597{
8598}
8599
8600static inline
8601int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8602{
8603 return 1;
8604}
8605
8606static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8607{
8608}
8609#endif /* CONFIG_FAIR_GROUP_SCHED */
8610
8611#ifdef CONFIG_RT_GROUP_SCHED 7137#ifdef CONFIG_RT_GROUP_SCHED
8612static void free_rt_sched_group(struct task_group *tg)
8613{
8614 int i;
8615
8616 if (tg->rt_se)
8617 destroy_rt_bandwidth(&tg->rt_bandwidth);
8618
8619 for_each_possible_cpu(i) {
8620 if (tg->rt_rq)
8621 kfree(tg->rt_rq[i]);
8622 if (tg->rt_se)
8623 kfree(tg->rt_se[i]);
8624 }
8625
8626 kfree(tg->rt_rq);
8627 kfree(tg->rt_se);
8628}
8629
8630static
8631int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8632{
8633 struct rt_rq *rt_rq;
8634 struct sched_rt_entity *rt_se;
8635 int i;
8636
8637 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8638 if (!tg->rt_rq)
8639 goto err;
8640 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8641 if (!tg->rt_se)
8642 goto err;
8643
8644 init_rt_bandwidth(&tg->rt_bandwidth,
8645 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8646
8647 for_each_possible_cpu(i) {
8648 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8649 GFP_KERNEL, cpu_to_node(i));
8650 if (!rt_rq)
8651 goto err;
8652
8653 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8654 GFP_KERNEL, cpu_to_node(i));
8655 if (!rt_se)
8656 goto err_free_rq;
8657
8658 init_rt_rq(rt_rq, cpu_rq(i));
8659 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8660 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8661 }
8662
8663 return 1;
8664
8665err_free_rq:
8666 kfree(rt_rq);
8667err:
8668 return 0;
8669}
8670#else /* !CONFIG_RT_GROUP_SCHED */ 7138#else /* !CONFIG_RT_GROUP_SCHED */
8671static inline void free_rt_sched_group(struct task_group *tg)
8672{
8673}
8674
8675static inline
8676int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8677{
8678 return 1;
8679}
8680#endif /* CONFIG_RT_GROUP_SCHED */ 7139#endif /* CONFIG_RT_GROUP_SCHED */
8681 7140
8682#ifdef CONFIG_CGROUP_SCHED 7141#ifdef CONFIG_CGROUP_SCHED
7142/* task_group_lock serializes the addition/removal of task groups */
7143static DEFINE_SPINLOCK(task_group_lock);
7144
8683static void free_sched_group(struct task_group *tg) 7145static void free_sched_group(struct task_group *tg)
8684{ 7146{
8685 free_fair_sched_group(tg); 7147 free_fair_sched_group(tg);
@@ -8785,47 +7247,6 @@ void sched_move_task(struct task_struct *tsk)
8785#endif /* CONFIG_CGROUP_SCHED */ 7247#endif /* CONFIG_CGROUP_SCHED */
8786 7248
8787#ifdef CONFIG_FAIR_GROUP_SCHED 7249#ifdef CONFIG_FAIR_GROUP_SCHED
8788static DEFINE_MUTEX(shares_mutex);
8789
8790int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8791{
8792 int i;
8793 unsigned long flags;
8794
8795 /*
8796 * We can't change the weight of the root cgroup.
8797 */
8798 if (!tg->se[0])
8799 return -EINVAL;
8800
8801 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8802
8803 mutex_lock(&shares_mutex);
8804 if (tg->shares == shares)
8805 goto done;
8806
8807 tg->shares = shares;
8808 for_each_possible_cpu(i) {
8809 struct rq *rq = cpu_rq(i);
8810 struct sched_entity *se;
8811
8812 se = tg->se[i];
8813 /* Propagate contribution to hierarchy */
8814 raw_spin_lock_irqsave(&rq->lock, flags);
8815 for_each_sched_entity(se)
8816 update_cfs_shares(group_cfs_rq(se));
8817 raw_spin_unlock_irqrestore(&rq->lock, flags);
8818 }
8819
8820done:
8821 mutex_unlock(&shares_mutex);
8822 return 0;
8823}
8824
8825unsigned long sched_group_shares(struct task_group *tg)
8826{
8827 return tg->shares;
8828}
8829#endif 7250#endif
8830 7251
8831#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7252#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8850,7 +7271,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8850 struct task_struct *g, *p; 7271 struct task_struct *g, *p;
8851 7272
8852 do_each_thread(g, p) { 7273 do_each_thread(g, p) {
8853 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7274 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8854 return 1; 7275 return 1;
8855 } while_each_thread(g, p); 7276 } while_each_thread(g, p);
8856 7277
@@ -9201,8 +7622,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9201 7622
9202static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7623static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9203{ 7624{
9204 int i, ret = 0, runtime_enabled; 7625 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9205 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9206 7627
9207 if (tg == &root_task_group) 7628 if (tg == &root_task_group)
9208 return -EINVAL; 7629 return -EINVAL;
@@ -9229,6 +7650,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9229 goto out_unlock; 7650 goto out_unlock;
9230 7651
9231 runtime_enabled = quota != RUNTIME_INF; 7652 runtime_enabled = quota != RUNTIME_INF;
7653 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7654 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9232 raw_spin_lock_irq(&cfs_b->lock); 7655 raw_spin_lock_irq(&cfs_b->lock);
9233 cfs_b->period = ns_to_ktime(period); 7656 cfs_b->period = ns_to_ktime(period);
9234 cfs_b->quota = quota; 7657 cfs_b->quota = quota;
@@ -9244,13 +7667,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9244 7667
9245 for_each_possible_cpu(i) { 7668 for_each_possible_cpu(i) {
9246 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7669 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9247 struct rq *rq = rq_of(cfs_rq); 7670 struct rq *rq = cfs_rq->rq;
9248 7671
9249 raw_spin_lock_irq(&rq->lock); 7672 raw_spin_lock_irq(&rq->lock);
9250 cfs_rq->runtime_enabled = runtime_enabled; 7673 cfs_rq->runtime_enabled = runtime_enabled;
9251 cfs_rq->runtime_remaining = 0; 7674 cfs_rq->runtime_remaining = 0;
9252 7675
9253 if (cfs_rq_throttled(cfs_rq)) 7676 if (cfs_rq->throttled)
9254 unthrottle_cfs_rq(cfs_rq); 7677 unthrottle_cfs_rq(cfs_rq);
9255 raw_spin_unlock_irq(&rq->lock); 7678 raw_spin_unlock_irq(&rq->lock);
9256 } 7679 }
@@ -9264,7 +7687,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9264{ 7687{
9265 u64 quota, period; 7688 u64 quota, period;
9266 7689
9267 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7690 period = ktime_to_ns(tg->cfs_bandwidth.period);
9268 if (cfs_quota_us < 0) 7691 if (cfs_quota_us < 0)
9269 quota = RUNTIME_INF; 7692 quota = RUNTIME_INF;
9270 else 7693 else
@@ -9277,10 +7700,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9277{ 7700{
9278 u64 quota_us; 7701 u64 quota_us;
9279 7702
9280 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7703 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9281 return -1; 7704 return -1;
9282 7705
9283 quota_us = tg_cfs_bandwidth(tg)->quota; 7706 quota_us = tg->cfs_bandwidth.quota;
9284 do_div(quota_us, NSEC_PER_USEC); 7707 do_div(quota_us, NSEC_PER_USEC);
9285 7708
9286 return quota_us; 7709 return quota_us;
@@ -9291,10 +7714,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9291 u64 quota, period; 7714 u64 quota, period;
9292 7715
9293 period = (u64)cfs_period_us * NSEC_PER_USEC; 7716 period = (u64)cfs_period_us * NSEC_PER_USEC;
9294 quota = tg_cfs_bandwidth(tg)->quota; 7717 quota = tg->cfs_bandwidth.quota;
9295
9296 if (period <= 0)
9297 return -EINVAL;
9298 7718
9299 return tg_set_cfs_bandwidth(tg, period, quota); 7719 return tg_set_cfs_bandwidth(tg, period, quota);
9300} 7720}
@@ -9303,7 +7723,7 @@ long tg_get_cfs_period(struct task_group *tg)
9303{ 7723{
9304 u64 cfs_period_us; 7724 u64 cfs_period_us;
9305 7725
9306 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7726 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9307 do_div(cfs_period_us, NSEC_PER_USEC); 7727 do_div(cfs_period_us, NSEC_PER_USEC);
9308 7728
9309 return cfs_period_us; 7729 return cfs_period_us;
@@ -9363,13 +7783,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9363static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7783static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9364{ 7784{
9365 struct cfs_schedulable_data *d = data; 7785 struct cfs_schedulable_data *d = data;
9366 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9367 s64 quota = 0, parent_quota = -1; 7787 s64 quota = 0, parent_quota = -1;
9368 7788
9369 if (!tg->parent) { 7789 if (!tg->parent) {
9370 quota = RUNTIME_INF; 7790 quota = RUNTIME_INF;
9371 } else { 7791 } else {
9372 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7792 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9373 7793
9374 quota = normalize_cfs_quota(tg, d); 7794 quota = normalize_cfs_quota(tg, d);
9375 parent_quota = parent_b->hierarchal_quota; 7795 parent_quota = parent_b->hierarchal_quota;
@@ -9413,7 +7833,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9413 struct cgroup_map_cb *cb) 7833 struct cgroup_map_cb *cb)
9414{ 7834{
9415 struct task_group *tg = cgroup_tg(cgrp); 7835 struct task_group *tg = cgroup_tg(cgrp);
9416 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7836 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9417 7837
9418 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7838 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9419 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7839 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9514,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9514 * (balbir@in.ibm.com). 7934 * (balbir@in.ibm.com).
9515 */ 7935 */
9516 7936
9517/* track cpu usage of a group of tasks and its child groups */
9518struct cpuacct {
9519 struct cgroup_subsys_state css;
9520 /* cpuusage holds pointer to a u64-type object on every cpu */
9521 u64 __percpu *cpuusage;
9522 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9523 struct cpuacct *parent;
9524};
9525
9526struct cgroup_subsys cpuacct_subsys;
9527
9528/* return cpu accounting group corresponding to this container */
9529static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9530{
9531 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9532 struct cpuacct, css);
9533}
9534
9535/* return cpu accounting group to which this task belongs */
9536static inline struct cpuacct *task_ca(struct task_struct *tsk)
9537{
9538 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9539 struct cpuacct, css);
9540}
9541
9542/* create a new cpu accounting group */ 7937/* create a new cpu accounting group */
9543static struct cgroup_subsys_state *cpuacct_create( 7938static struct cgroup_subsys_state *cpuacct_create(
9544 struct cgroup_subsys *ss, struct cgroup *cgrp) 7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
9545{ 7940{
9546 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7941 struct cpuacct *ca;
9547 int i;
9548 7942
7943 if (!cgrp->parent)
7944 return &root_cpuacct.css;
7945
7946 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9549 if (!ca) 7947 if (!ca)
9550 goto out; 7948 goto out;
9551 7949
@@ -9553,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9553 if (!ca->cpuusage) 7951 if (!ca->cpuusage)
9554 goto out_free_ca; 7952 goto out_free_ca;
9555 7953
9556 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7954 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9557 if (percpu_counter_init(&ca->cpustat[i], 0)) 7955 if (!ca->cpustat)
9558 goto out_free_counters; 7956 goto out_free_cpuusage;
9559
9560 if (cgrp->parent)
9561 ca->parent = cgroup_ca(cgrp->parent);
9562 7957
9563 return &ca->css; 7958 return &ca->css;
9564 7959
9565out_free_counters: 7960out_free_cpuusage:
9566 while (--i >= 0)
9567 percpu_counter_destroy(&ca->cpustat[i]);
9568 free_percpu(ca->cpuusage); 7961 free_percpu(ca->cpuusage);
9569out_free_ca: 7962out_free_ca:
9570 kfree(ca); 7963 kfree(ca);
@@ -9577,10 +7970,8 @@ static void
9577cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9578{ 7971{
9579 struct cpuacct *ca = cgroup_ca(cgrp); 7972 struct cpuacct *ca = cgroup_ca(cgrp);
9580 int i;
9581 7973
9582 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7974 free_percpu(ca->cpustat);
9583 percpu_counter_destroy(&ca->cpustat[i]);
9584 free_percpu(ca->cpuusage); 7975 free_percpu(ca->cpuusage);
9585 kfree(ca); 7976 kfree(ca);
9586} 7977}
@@ -9673,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = {
9673}; 8064};
9674 8065
9675static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8066static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9676 struct cgroup_map_cb *cb) 8067 struct cgroup_map_cb *cb)
9677{ 8068{
9678 struct cpuacct *ca = cgroup_ca(cgrp); 8069 struct cpuacct *ca = cgroup_ca(cgrp);
9679 int i; 8070 int cpu;
8071 s64 val = 0;
8072
8073 for_each_online_cpu(cpu) {
8074 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8075 val += kcpustat->cpustat[CPUTIME_USER];
8076 val += kcpustat->cpustat[CPUTIME_NICE];
8077 }
8078 val = cputime64_to_clock_t(val);
8079 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9680 8080
9681 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8081 val = 0;
9682 s64 val = percpu_counter_read(&ca->cpustat[i]); 8082 for_each_online_cpu(cpu) {
9683 val = cputime64_to_clock_t(val); 8083 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9684 cb->fill(cb, cpuacct_stat_desc[i], val); 8084 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8085 val += kcpustat->cpustat[CPUTIME_IRQ];
8086 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9685 } 8087 }
8088
8089 val = cputime64_to_clock_t(val);
8090 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8091
9686 return 0; 8092 return 0;
9687} 8093}
9688 8094
@@ -9712,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9712 * 8118 *
9713 * called with rq->lock held. 8119 * called with rq->lock held.
9714 */ 8120 */
9715static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8121void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9716{ 8122{
9717 struct cpuacct *ca; 8123 struct cpuacct *ca;
9718 int cpu; 8124 int cpu;
@@ -9726,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9726 8132
9727 ca = task_ca(tsk); 8133 ca = task_ca(tsk);
9728 8134
9729 for (; ca; ca = ca->parent) { 8135 for (; ca; ca = parent_ca(ca)) {
9730 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8136 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9731 *cpuusage += cputime; 8137 *cpuusage += cputime;
9732 } 8138 }
@@ -9734,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9734 rcu_read_unlock(); 8140 rcu_read_unlock();
9735} 8141}
9736 8142
9737/*
9738 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9739 * in cputime_t units. As a result, cpuacct_update_stats calls
9740 * percpu_counter_add with values large enough to always overflow the
9741 * per cpu batch limit causing bad SMP scalability.
9742 *
9743 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9744 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9745 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9746 */
9747#ifdef CONFIG_SMP
9748#define CPUACCT_BATCH \
9749 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9750#else
9751#define CPUACCT_BATCH 0
9752#endif
9753
9754/*
9755 * Charge the system/user time to the task's accounting group.
9756 */
9757static void cpuacct_update_stats(struct task_struct *tsk,
9758 enum cpuacct_stat_index idx, cputime_t val)
9759{
9760 struct cpuacct *ca;
9761 int batch = CPUACCT_BATCH;
9762
9763 if (unlikely(!cpuacct_subsys.active))
9764 return;
9765
9766 rcu_read_lock();
9767 ca = task_ca(tsk);
9768
9769 do {
9770 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9771 ca = ca->parent;
9772 } while (ca);
9773 rcu_read_unlock();
9774}
9775
9776struct cgroup_subsys cpuacct_subsys = { 8143struct cgroup_subsys cpuacct_subsys = {
9777 .name = "cpuacct", 8144 .name = "cpuacct",
9778 .create = cpuacct_create, 8145 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 8a39fa3e3c6c..8e42de9105f8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
895 1005
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
898 1007
899 if (tsk) { 1008 if (tsk) {
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
910 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
912 1021
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
915 1023
916 if (tsk) { 1024 if (tsk) {
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
921 } 1029 }
922 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
923 /* 1033 /*
924 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1563 unsigned long delta_exec)
1423{ 1564{
1424 if (!cfs_rq->runtime_enabled) 1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1566 return;
1426 1567
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1570
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1572{
1432 return cfs_rq->throttled; 1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1574}
1434 1575
1435/* check whether cfs_rq, or any parent, is throttled */ 1576/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1578{
1438 return cfs_rq->throttle_count; 1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1580}
1440 1581
1441/* 1582/*
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1897
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1904 return;
1761 1905
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1945 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1804 /* an active group must be handled by the update_curr()->put() path */ 1951 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1953 return;
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1972 return;
1823 1973
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1980
1831 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1832} 1982}
1833#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2107{
1853 return 0; 2108 return 0;
1854} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2115#endif
1856 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1857/************************************************** 2126/**************************************************
1858 * CFS operations on tasks: 2127 * CFS operations on tasks:
1859 */ 2128 */
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2135
1867 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1868 2137
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2166{
1898 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1899 2168
1900 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2170 return;
1902 2171
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2289}
2021 2290
2022#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
2023 2347
2024static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2025{ 2349{
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2652 struct sched_domain *sd;
2329 struct sched_group *sg; 2653 struct sched_group *sg;
2330 int i, smt = 0; 2654 int i;
2331 2655
2332 /* 2656 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2657 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2347 * Otherwise, iterate the domains and find an eligible idle cpu. 2671 * Otherwise, iterate the domains and find an eligible idle cpu.
2348 */ 2672 */
2349 rcu_read_lock(); 2673 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360 2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) {
2361 sg = sd->groups; 2677 sg = sd->groups;
2362 do { 2678 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg), 2679 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2376,10 +2692,6 @@ next:
2376 sg = sg->next; 2692 sg = sg->next;
2377 } while (sg != sd->groups); 2693 } while (sg != sd->groups);
2378 } 2694 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done: 2695done:
2384 rcu_read_unlock(); 2696 rcu_read_unlock();
2385 2697
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2408 int want_sd = 1; 2720 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
2410 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
2411 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1; 2728 want_affine = 1;
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2692 } while (cfs_rq); 3007 } while (cfs_rq);
2693 3008
2694 p = task_of(se); 3009 p = task_of(se);
2695 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
2696 3012
2697 return p; 3013 return p;
2698} 3014}
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
2736 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
2737 */ 3053 */
2738 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
2739 } 3061 }
2740 3062
2741 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2776} 3098}
2777 3099
2778/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02
3134#define LBF_ABORT 0x04
3135
3136/*
2779 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2780 */ 3138 */
2781static 3139static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3140int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle, 3141 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned) 3142 int *lb_flags)
2785{ 3143{
2786 int tsk_cache_hot = 0; 3144 int tsk_cache_hot = 0;
2787 /* 3145 /*
@@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3152 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0; 3153 return 0;
2796 } 3154 }
2797 *all_pinned = 0; 3155 *lb_flags &= ~LBF_ALL_PINNED;
2798 3156
2799 if (task_running(rq, p)) { 3157 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3158 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2868static unsigned long 3226static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3227balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd, 3228 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned, 3229 enum cpu_idle_type idle, int *lb_flags,
2872 struct cfs_rq *busiest_cfs_rq) 3230 struct cfs_rq *busiest_cfs_rq)
2873{ 3231{
2874 int loops = 0, pulled = 0; 3232 int loops = 0, pulled = 0;
@@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2879 goto out; 3237 goto out;
2880 3238
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3239 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate) 3240 if (loops++ > sysctl_sched_nr_migrate) {
3241 *lb_flags |= LBF_NEED_BREAK;
2883 break; 3242 break;
3243 }
2884 3244
2885 if ((p->se.load.weight >> 1) > rem_load_move || 3245 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3246 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned)) 3247 lb_flags))
2888 continue; 3248 continue;
2889 3249
2890 pull_task(busiest, p, this_rq, this_cpu); 3250 pull_task(busiest, p, this_rq, this_cpu);
@@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2897 * kernels will stop after the first task is pulled to minimize 3257 * kernels will stop after the first task is pulled to minimize
2898 * the critical section. 3258 * the critical section.
2899 */ 3259 */
2900 if (idle == CPU_NEWLY_IDLE) 3260 if (idle == CPU_NEWLY_IDLE) {
3261 *lb_flags |= LBF_ABORT;
2901 break; 3262 break;
3263 }
2902#endif 3264#endif
2903 3265
2904 /* 3266 /*
@@ -3003,7 +3365,7 @@ static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3365load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move, 3366 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle, 3367 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned) 3368 int *lb_flags)
3007{ 3369{
3008 long rem_load_move = max_load_move; 3370 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq; 3371 struct cfs_rq *busiest_cfs_rq;
@@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3378 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load; 3379 u64 rem_load, moved_load;
3018 3380
3381 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3382 break;
3383
3019 /* 3384 /*
3020 * empty group or part of a throttled hierarchy 3385 * empty group or part of a throttled hierarchy
3021 */ 3386 */
@@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3027 rem_load = div_u64(rem_load, busiest_h_load + 1); 3392 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028 3393
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3394 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned, 3395 rem_load, sd, idle, lb_flags,
3031 busiest_cfs_rq); 3396 busiest_cfs_rq);
3032 3397
3033 if (!moved_load) 3398 if (!moved_load)
@@ -3053,10 +3418,10 @@ static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3418load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move, 3419 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle, 3420 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned) 3421 int *lb_flags)
3057{ 3422{
3058 return balance_tasks(this_rq, this_cpu, busiest, 3423 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned, 3424 max_load_move, sd, idle, lb_flags,
3060 &busiest->cfs); 3425 &busiest->cfs);
3061} 3426}
3062#endif 3427#endif
@@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3436static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move, 3437 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned) 3439 int *lb_flags)
3075{ 3440{
3076 unsigned long total_load_moved = 0, load_moved; 3441 unsigned long total_load_moved = 0, load_moved;
3077 3442
3078 do { 3443 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3444 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved, 3445 max_load_move - total_load_moved,
3081 sd, idle, all_pinned); 3446 sd, idle, lb_flags);
3082 3447
3083 total_load_moved += load_moved; 3448 total_load_moved += load_moved;
3084 3449
3450 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3451 break;
3452
3085#ifdef CONFIG_PREEMPT 3453#ifdef CONFIG_PREEMPT
3086 /* 3454 /*
3087 * NEWIDLE balancing is a source of latency, so preemptible 3455 * NEWIDLE balancing is a source of latency, so preemptible
3088 * kernels will stop after the first task is pulled to minimize 3456 * kernels will stop after the first task is pulled to minimize
3089 * the critical section. 3457 * the critical section.
3090 */ 3458 */
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3459 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3092 break; 3460 *lb_flags |= LBF_ABORT;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break; 3461 break;
3462 }
3097#endif 3463#endif
3098 } while (load_moved && max_load_move > total_load_moved); 3464 } while (load_moved && max_load_move > total_load_moved);
3099 3465
@@ -3155,15 +3521,6 @@ struct sg_lb_stats {
3155}; 3521};
3156 3522
3157/** 3523/**
3158 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3159 * @group: The group whose first cpu is to be returned.
3160 */
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166/**
3167 * get_sd_load_idx - Obtain the load index for a given sched domain. 3524 * get_sd_load_idx - Obtain the load index for a given sched domain.
3168 * @sd: The sched_domain whose load_idx is to be obtained. 3525 * @sd: The sched_domain whose load_idx is to be obtained.
3168 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3525 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3412 sdg->sgp->power = power; 3769 sdg->sgp->power = power;
3413} 3770}
3414 3771
3415static void update_group_power(struct sched_domain *sd, int cpu) 3772void update_group_power(struct sched_domain *sd, int cpu)
3416{ 3773{
3417 struct sched_domain *child = sd->child; 3774 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups; 3775 struct sched_group *group, *sdg = sd->groups;
@@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3678 } while (sg != sd->groups); 4035 } while (sg != sd->groups);
3679} 4036}
3680 4037
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686/** 4038/**
3687 * check_asym_packing - Check to see if the group is packed into the 4039 * check_asym_packing - Check to see if the group is packed into the
3688 * sched domain. 4040 * sched domain.
@@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4046#define MAX_PINNED_INTERVAL 512 4398#define MAX_PINNED_INTERVAL 512
4047 4399
4048/* Working cpumask for load_balance and load_balance_newidle. */ 4400/* Working cpumask for load_balance and load_balance_newidle. */
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4401DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050 4402
4051static int need_active_balance(struct sched_domain *sd, int idle, 4403static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu) 4404 int busiest_cpu, int this_cpu)
@@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle, 4449 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance) 4450 int *balance)
4099{ 4451{
4100 int ld_moved, all_pinned = 0, active_balance = 0; 4452 int ld_moved, lb_flags = 0, active_balance = 0;
4101 struct sched_group *group; 4453 struct sched_group *group;
4102 unsigned long imbalance; 4454 unsigned long imbalance;
4103 struct rq *busiest; 4455 struct rq *busiest;
@@ -4138,11 +4490,11 @@ redo:
4138 * still unbalanced. ld_moved simply stays zero, so it is 4490 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4491 * correctly treated as an imbalance.
4140 */ 4492 */
4141 all_pinned = 1; 4493 lb_flags |= LBF_ALL_PINNED;
4142 local_irq_save(flags); 4494 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest); 4495 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4496 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned); 4497 imbalance, sd, idle, &lb_flags);
4146 double_rq_unlock(this_rq, busiest); 4498 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags); 4499 local_irq_restore(flags);
4148 4500
@@ -4152,8 +4504,16 @@ redo:
4152 if (ld_moved && this_cpu != smp_processor_id()) 4504 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu); 4505 resched_cpu(this_cpu);
4154 4506
4507 if (lb_flags & LBF_ABORT)
4508 goto out_balanced;
4509
4510 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK;
4512 goto redo;
4513 }
4514
4155 /* All tasks on this runqueue were pinned by CPU affinity */ 4515 /* All tasks on this runqueue were pinned by CPU affinity */
4156 if (unlikely(all_pinned)) { 4516 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus); 4517 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus)) 4518 if (!cpumask_empty(cpus))
4159 goto redo; 4519 goto redo;
@@ -4183,7 +4543,7 @@ redo:
4183 tsk_cpus_allowed(busiest->curr))) { 4543 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock, 4544 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags); 4545 flags);
4186 all_pinned = 1; 4546 lb_flags |= LBF_ALL_PINNED;
4187 goto out_one_pinned; 4547 goto out_one_pinned;
4188 } 4548 }
4189 4549
@@ -4236,7 +4596,8 @@ out_balanced:
4236 4596
4237out_one_pinned: 4597out_one_pinned:
4238 /* tune up the balancing interval */ 4598 /* tune up the balancing interval */
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4599 if (((lb_flags & LBF_ALL_PINNED) &&
4600 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval)) 4601 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2; 4602 sd->balance_interval *= 2;
4242 4603
@@ -4249,7 +4610,7 @@ out:
4249 * idle_balance is called by schedule() if this_cpu is about to become 4610 * idle_balance is called by schedule() if this_cpu is about to become
4250 * idle. Attempts to pull tasks from other CPUs. 4611 * idle. Attempts to pull tasks from other CPUs.
4251 */ 4612 */
4252static void idle_balance(int this_cpu, struct rq *this_rq) 4613void idle_balance(int this_cpu, struct rq *this_rq)
4253{ 4614{
4254 struct sched_domain *sd; 4615 struct sched_domain *sd;
4255 int pulled_task = 0; 4616 int pulled_task = 0;
@@ -4364,28 +4725,16 @@ out_unlock:
4364#ifdef CONFIG_NO_HZ 4725#ifdef CONFIG_NO_HZ
4365/* 4726/*
4366 * idle load balancing details 4727 * idle load balancing details
4367 * - One of the idle CPUs nominates itself as idle load_balancer, while
4368 * entering idle.
4369 * - This idle load balancer CPU will also go into tickless mode when
4370 * it is idle, just like all other idle CPUs
4371 * - When one of the busy CPUs notices that there may be an idle rebalancing 4728 * - When one of the busy CPUs notices that there may be an idle rebalancing
4372 * needed, they will kick the idle load balancer, which then does idle 4729 * needed, they will kick the idle load balancer, which then does idle
4373 * load balancing for all the idle CPUs. 4730 * load balancing for all the idle CPUs.
4374 */ 4731 */
4375static struct { 4732static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask; 4733 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask; 4734 atomic_t nr_cpus;
4381 unsigned long next_balance; /* in jiffy units */ 4735 unsigned long next_balance; /* in jiffy units */
4382} nohz ____cacheline_aligned; 4736} nohz ____cacheline_aligned;
4383 4737
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4738#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390/** 4739/**
4391 * lowest_flag_domain - Return lowest sched_domain containing flag. 4740 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4422 (sd && (sd->flags & flag)); sd = sd->parent) 4771 (sd && (sd->flags & flag)); sd = sd->parent)
4423 4772
4424/** 4773/**
4425 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4426 * @ilb_group: group to be checked for semi-idleness
4427 *
4428 * Returns: 1 if the group is semi-idle. 0 otherwise.
4429 *
4430 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4431 * and atleast one non-idle CPU. This helper function checks if the given
4432 * sched_group is semi-idle or not.
4433 */
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439 /*
4440 * A sched_group is semi-idle when it has atleast one busy cpu
4441 * and atleast one idle cpu.
4442 */
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451/**
4452 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4774 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4453 * @cpu: The cpu which is nominating a new idle_load_balancer. 4775 * @cpu: The cpu which is nominating a new idle_load_balancer.
4454 * 4776 *
@@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4462 */ 4784 */
4463static int find_new_ilb(int cpu) 4785static int find_new_ilb(int cpu)
4464{ 4786{
4787 int ilb = cpumask_first(nohz.idle_cpus_mask);
4788 struct sched_group *ilbg;
4465 struct sched_domain *sd; 4789 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468 4790
4469 /* 4791 /*
4470 * Have idle load balancer selection from semi-idle packages only 4792 * Have idle load balancer selection from semi-idle packages only
@@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu)
4482 4804
4483 rcu_read_lock(); 4805 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4806 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups; 4807 ilbg = sd->groups;
4486 4808
4487 do { 4809 do {
4488 if (is_semi_idle_group(ilb_group)) { 4810 if (ilbg->group_weight !=
4489 ilb = cpumask_first(nohz.grp_idle_mask); 4811 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4812 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4813 sched_group_cpus(ilbg));
4490 goto unlock; 4814 goto unlock;
4491 } 4815 }
4492 4816
4493 ilb_group = ilb_group->next; 4817 ilbg = ilbg->next;
4494 4818
4495 } while (ilb_group != sd->groups); 4819 } while (ilbg != sd->groups);
4496 } 4820 }
4497unlock: 4821unlock:
4498 rcu_read_unlock(); 4822 rcu_read_unlock();
4499 4823
4500out_done: 4824out_done:
4501 return ilb; 4825 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4826 return ilb;
4827
4828 return nr_cpu_ids;
4502} 4829}
4503#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4830#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4504static inline int find_new_ilb(int call_cpu) 4831static inline int find_new_ilb(int call_cpu)
@@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu)
4518 4845
4519 nohz.next_balance++; 4846 nohz.next_balance++;
4520 4847
4521 ilb_cpu = get_nohz_load_balancer(); 4848 ilb_cpu = find_new_ilb(cpu);
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528 4849
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4850 if (ilb_cpu >= nr_cpu_ids)
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4851 return;
4531 4852
4532 smp_mb(); 4853 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4533 /* 4854 return;
4534 * Use smp_send_reschedule() instead of resched_cpu(). 4855 /*
4535 * This way we generate a sched IPI on the target cpu which 4856 * Use smp_send_reschedule() instead of resched_cpu().
4536 * is idle. And the softirq performing nohz idle load balance 4857 * This way we generate a sched IPI on the target cpu which
4537 * will be run before returning from the IPI. 4858 * is idle. And the softirq performing nohz idle load balance
4538 */ 4859 * will be run before returning from the IPI.
4539 smp_send_reschedule(ilb_cpu); 4860 */
4540 } 4861 smp_send_reschedule(ilb_cpu);
4541 return; 4862 return;
4542} 4863}
4543 4864
4544/* 4865static inline void set_cpu_sd_state_busy(void)
4545 * This routine will try to nominate the ilb (idle load balancing)
4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4547 * load balancing on behalf of all those cpus.
4548 *
4549 * When the ilb owner becomes busy, we will not have new ilb owner until some
4550 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4551 * idle load balancing by kicking one of the idle CPUs.
4552 *
4553 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4554 * ilb owner CPU in future (when there is a need for idle load balancing on
4555 * behalf of all idle CPUs).
4556 */
4557void select_nohz_load_balancer(int stop_tick)
4558{ 4866{
4867 struct sched_domain *sd;
4559 int cpu = smp_processor_id(); 4868 int cpu = smp_processor_id();
4560 4869
4561 if (stop_tick) { 4870 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4562 if (!cpu_active(cpu)) { 4871 return;
4563 if (atomic_read(&nohz.load_balancer) != cpu) 4872 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4564 return;
4565
4566 /*
4567 * If we are going offline and still the leader,
4568 * give up!
4569 */
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573 4873
4574 return; 4874 rcu_read_lock();
4575 } 4875 for_each_domain(cpu, sd)
4876 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4877 rcu_read_unlock();
4878}
4576 4879
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4880void set_cpu_sd_state_idle(void)
4881{
4882 struct sched_domain *sd;
4883 int cpu = smp_processor_id();
4578 4884
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4885 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4886 return;
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4887 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583 4888
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4889 rcu_read_lock();
4585 int new_ilb; 4890 for_each_domain(cpu, sd)
4891 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4892 rcu_read_unlock();
4893}
4586 4894
4587 /* make me the ilb owner */ 4895/*
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4896 * This routine will record that this cpu is going idle with tick stopped.
4589 cpu) != nr_cpu_ids) 4897 * This info will be used in performing idle load balancing in the future.
4590 return; 4898 */
4899void select_nohz_load_balancer(int stop_tick)
4900{
4901 int cpu = smp_processor_id();
4591 4902
4592 /* 4903 if (stop_tick) {
4593 * Check to see if there is a more power-efficient 4904 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4594 * ilb.
4595 */
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return; 4905 return;
4607 4906
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4907 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4609 4908 atomic_inc(&nohz.nr_cpus);
4610 if (atomic_read(&nohz.load_balancer) == cpu) 4909 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 } 4910 }
4615 return; 4911 return;
4616} 4912}
@@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4624 * Scale the max load_balance interval with the number of CPUs in the system. 4920 * Scale the max load_balance interval with the number of CPUs in the system.
4625 * This trades load-balance latency on larger machines for less cross talk. 4921 * This trades load-balance latency on larger machines for less cross talk.
4626 */ 4922 */
4627static void update_max_interval(void) 4923void update_max_interval(void)
4628{ 4924{
4629 max_load_balance_interval = HZ*num_online_cpus()/10; 4925 max_load_balance_interval = HZ*num_online_cpus()/10;
4630} 4926}
@@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4716 struct rq *rq; 5012 struct rq *rq;
4717 int balance_cpu; 5013 int balance_cpu;
4718 5014
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5015 if (idle != CPU_IDLE ||
4720 return; 5016 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5017 goto end;
4721 5018
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5019 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu) 5020 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4724 continue; 5021 continue;
4725 5022
4726 /* 5023 /*
@@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4728 * work being done for other cpus. Next load 5025 * work being done for other cpus. Next load
4729 * balancing owner will pick it up. 5026 * balancing owner will pick it up.
4730 */ 5027 */
4731 if (need_resched()) { 5028 if (need_resched())
4732 this_rq->nohz_balance_kick = 0;
4733 break; 5029 break;
4734 }
4735 5030
4736 raw_spin_lock_irq(&this_rq->lock); 5031 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq); 5032 update_rq_clock(this_rq);
@@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4745 this_rq->next_balance = rq->next_balance; 5040 this_rq->next_balance = rq->next_balance;
4746 } 5041 }
4747 nohz.next_balance = this_rq->next_balance; 5042 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0; 5043end:
5044 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4749} 5045}
4750 5046
4751/* 5047/*
4752 * Current heuristic for kicking the idle load balancer 5048 * Current heuristic for kicking the idle load balancer in the presence
4753 * - first_pick_cpu is the one of the busy CPUs. It will kick 5049 * of an idle cpu is the system.
4754 * idle load balancer when it has more than one process active. This 5050 * - This rq has more than one task.
4755 * eliminates the need for idle load balancing altogether when we have 5051 * - At any scheduler domain level, this cpu's scheduler group has multiple
4756 * only one running process in the system (common case). 5052 * busy cpu's exceeding the group's power.
4757 * - If there are more than one busy CPU, idle load balancer may have 5053 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4758 * to run for active_load_balance to happen (i.e., two busy CPUs are 5054 * domain span are idle.
4759 * SMT or core siblings and can run better if they move to different
4760 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4761 * which will kick idle load balancer as soon as it has any load.
4762 */ 5055 */
4763static inline int nohz_kick_needed(struct rq *rq, int cpu) 5056static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{ 5057{
4765 unsigned long now = jiffies; 5058 unsigned long now = jiffies;
4766 int ret; 5059 struct sched_domain *sd;
4767 int first_pick_cpu, second_pick_cpu;
4768 5060
4769 if (time_before(now, nohz.next_balance)) 5061 if (unlikely(idle_cpu(cpu)))
4770 return 0; 5062 return 0;
4771 5063
4772 if (idle_cpu(cpu)) 5064 /*
4773 return 0; 5065 * We may be recently in ticked or tickless idle mode. At the first
5066 * busy tick after returning from idle, we will update the busy stats.
5067 */
5068 set_cpu_sd_state_busy();
5069 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5070 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5071 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5072 atomic_dec(&nohz.nr_cpus);
5073 }
4774 5074
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5075 /*
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5076 * None are in tickless mode and hence no need for NOHZ idle load
5077 * balancing.
5078 */
5079 if (likely(!atomic_read(&nohz.nr_cpus)))
5080 return 0;
4777 5081
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5082 if (time_before(now, nohz.next_balance))
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0; 5083 return 0;
4781 5084
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5085 if (rq->nr_running >= 2)
4783 if (ret == nr_cpu_ids || ret == cpu) { 5086 goto need_kick;
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5087
4785 if (rq->nr_running > 1) 5088 rcu_read_lock();
4786 return 1; 5089 for_each_domain(cpu, sd) {
4787 } else { 5090 struct sched_group *sg = sd->groups;
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5091 struct sched_group_power *sgp = sg->sgp;
4789 if (ret == nr_cpu_ids || ret == cpu) { 5092 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4790 if (rq->nr_running) 5093
4791 return 1; 5094 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4792 } 5095 goto need_kick_unlock;
5096
5097 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5098 && (cpumask_first_and(nohz.idle_cpus_mask,
5099 sched_domain_span(sd)) < cpu))
5100 goto need_kick_unlock;
5101
5102 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5103 break;
4793 } 5104 }
5105 rcu_read_unlock();
4794 return 0; 5106 return 0;
5107
5108need_kick_unlock:
5109 rcu_read_unlock();
5110need_kick:
5111 return 1;
4795} 5112}
4796#else 5113#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5114static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu)
4826/* 5143/*
4827 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5144 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4828 */ 5145 */
4829static inline void trigger_load_balance(struct rq *rq, int cpu) 5146void trigger_load_balance(struct rq *rq, int cpu)
4830{ 5147{
4831 /* Don't need to rebalance while attached to NULL domain */ 5148 /* Don't need to rebalance while attached to NULL domain */
4832 if (time_after_eq(jiffies, rq->next_balance) && 5149 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu))) 5150 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ); 5151 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ 5152#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5153 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu); 5154 nohz_balancer_kick(cpu);
4838#endif 5155#endif
4839} 5156}
@@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq)
4848 update_sysctl(); 5165 update_sysctl();
4849} 5166}
4850 5167
4851#else /* CONFIG_SMP */
4852
4853/*
4854 * on UP we do not need to balance between CPUs:
4855 */
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif /* CONFIG_SMP */ 5168#endif /* CONFIG_SMP */
4861 5169
4862/* 5170/*
@@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4880 */ 5188 */
4881static void task_fork_fair(struct task_struct *p) 5189static void task_fork_fair(struct task_struct *p)
4882{ 5190{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5191 struct cfs_rq *cfs_rq;
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5192 struct sched_entity *se = &p->se, *curr;
4885 int this_cpu = smp_processor_id(); 5193 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq(); 5194 struct rq *rq = this_rq();
4887 unsigned long flags; 5195 unsigned long flags;
@@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p)
4890 5198
4891 update_rq_clock(rq); 5199 update_rq_clock(rq);
4892 5200
5201 cfs_rq = task_cfs_rq(current);
5202 curr = cfs_rq->curr;
5203
4893 if (unlikely(task_cpu(p) != this_cpu)) { 5204 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock(); 5205 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu); 5206 __set_task_cpu(p, this_cpu);
@@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq)
4999 } 5310 }
5000} 5311}
5001 5312
5313void init_cfs_rq(struct cfs_rq *cfs_rq)
5314{
5315 cfs_rq->tasks_timeline = RB_ROOT;
5316 INIT_LIST_HEAD(&cfs_rq->tasks);
5317 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5318#ifndef CONFIG_64BIT
5319 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5320#endif
5321}
5322
5002#ifdef CONFIG_FAIR_GROUP_SCHED 5323#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq) 5324static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{ 5325{
@@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5015 * to another cgroup's rq. This does somewhat interfere with the 5336 * to another cgroup's rq. This does somewhat interfere with the
5016 * fair sleeper stuff for the first placement, but who cares. 5337 * fair sleeper stuff for the first placement, but who cares.
5017 */ 5338 */
5339 /*
5340 * When !on_rq, vruntime of the task has usually NOT been normalized.
5341 * But there are some cases where it has already been normalized:
5342 *
5343 * - Moving a forked child which is waiting for being woken up by
5344 * wake_up_new_task().
5345 * - Moving a task which has been woken up by try_to_wake_up() and
5346 * waiting for actually being woken up by sched_ttwu_pending().
5347 *
5348 * To prevent boost or penalty in the new cfs_rq caused by delta
5349 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5350 */
5351 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5352 on_rq = 1;
5353
5018 if (!on_rq) 5354 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5355 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p)); 5356 set_task_rq(p, task_cpu(p));
5021 if (!on_rq) 5357 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5358 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023} 5359}
5360
5361void free_fair_sched_group(struct task_group *tg)
5362{
5363 int i;
5364
5365 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5366
5367 for_each_possible_cpu(i) {
5368 if (tg->cfs_rq)
5369 kfree(tg->cfs_rq[i]);
5370 if (tg->se)
5371 kfree(tg->se[i]);
5372 }
5373
5374 kfree(tg->cfs_rq);
5375 kfree(tg->se);
5376}
5377
5378int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5379{
5380 struct cfs_rq *cfs_rq;
5381 struct sched_entity *se;
5382 int i;
5383
5384 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5385 if (!tg->cfs_rq)
5386 goto err;
5387 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5388 if (!tg->se)
5389 goto err;
5390
5391 tg->shares = NICE_0_LOAD;
5392
5393 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5394
5395 for_each_possible_cpu(i) {
5396 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5397 GFP_KERNEL, cpu_to_node(i));
5398 if (!cfs_rq)
5399 goto err;
5400
5401 se = kzalloc_node(sizeof(struct sched_entity),
5402 GFP_KERNEL, cpu_to_node(i));
5403 if (!se)
5404 goto err_free_rq;
5405
5406 init_cfs_rq(cfs_rq);
5407 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5408 }
5409
5410 return 1;
5411
5412err_free_rq:
5413 kfree(cfs_rq);
5414err:
5415 return 0;
5416}
5417
5418void unregister_fair_sched_group(struct task_group *tg, int cpu)
5419{
5420 struct rq *rq = cpu_rq(cpu);
5421 unsigned long flags;
5422
5423 /*
5424 * Only empty task groups can be destroyed; so we can speculatively
5425 * check on_list without danger of it being re-added.
5426 */
5427 if (!tg->cfs_rq[cpu]->on_list)
5428 return;
5429
5430 raw_spin_lock_irqsave(&rq->lock, flags);
5431 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5432 raw_spin_unlock_irqrestore(&rq->lock, flags);
5433}
5434
5435void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5436 struct sched_entity *se, int cpu,
5437 struct sched_entity *parent)
5438{
5439 struct rq *rq = cpu_rq(cpu);
5440
5441 cfs_rq->tg = tg;
5442 cfs_rq->rq = rq;
5443#ifdef CONFIG_SMP
5444 /* allow initial update_cfs_load() to truncate */
5445 cfs_rq->load_stamp = 1;
5024#endif 5446#endif
5447 init_cfs_rq_runtime(cfs_rq);
5448
5449 tg->cfs_rq[cpu] = cfs_rq;
5450 tg->se[cpu] = se;
5451
5452 /* se could be NULL for root_task_group */
5453 if (!se)
5454 return;
5455
5456 if (!parent)
5457 se->cfs_rq = &rq->cfs;
5458 else
5459 se->cfs_rq = parent->my_q;
5460
5461 se->my_q = cfs_rq;
5462 update_load_set(&se->load, 0);
5463 se->parent = parent;
5464}
5465
5466static DEFINE_MUTEX(shares_mutex);
5467
5468int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5469{
5470 int i;
5471 unsigned long flags;
5472
5473 /*
5474 * We can't change the weight of the root cgroup.
5475 */
5476 if (!tg->se[0])
5477 return -EINVAL;
5478
5479 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5480
5481 mutex_lock(&shares_mutex);
5482 if (tg->shares == shares)
5483 goto done;
5484
5485 tg->shares = shares;
5486 for_each_possible_cpu(i) {
5487 struct rq *rq = cpu_rq(i);
5488 struct sched_entity *se;
5489
5490 se = tg->se[i];
5491 /* Propagate contribution to hierarchy */
5492 raw_spin_lock_irqsave(&rq->lock, flags);
5493 for_each_sched_entity(se)
5494 update_cfs_shares(group_cfs_rq(se));
5495 raw_spin_unlock_irqrestore(&rq->lock, flags);
5496 }
5497
5498done:
5499 mutex_unlock(&shares_mutex);
5500 return 0;
5501}
5502#else /* CONFIG_FAIR_GROUP_SCHED */
5503
5504void free_fair_sched_group(struct task_group *tg) { }
5505
5506int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5507{
5508 return 1;
5509}
5510
5511void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5512
5513#endif /* CONFIG_FAIR_GROUP_SCHED */
5514
5025 5515
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5516static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{ 5517{
@@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5041/* 5531/*
5042 * All the scheduling class methods: 5532 * All the scheduling class methods:
5043 */ 5533 */
5044static const struct sched_class fair_sched_class = { 5534const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class, 5535 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair, 5536 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair, 5537 .dequeue_task = dequeue_task_fair,
@@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = {
5078}; 5568};
5079 5569
5080#ifdef CONFIG_SCHED_DEBUG 5570#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu) 5571void print_cfs_stats(struct seq_file *m, int cpu)
5082{ 5572{
5083 struct cfs_rq *cfs_rq; 5573 struct cfs_rq *cfs_rq;
5084 5574
@@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5088 rcu_read_unlock(); 5578 rcu_read_unlock();
5089} 5579}
5090#endif 5580#endif
5581
5582__init void init_sched_fair_class(void)
5583{
5584#ifdef CONFIG_SMP
5585 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5586
5587#ifdef CONFIG_NO_HZ
5588 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5589#endif
5590#endif /* SMP */
5591
5592}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd2..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe6..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
/*
 * rt_policy - is @policy a realtime scheduling policy?
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for every other policy.
 */
static inline int rt_policy(int policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return 1;
	default:
		return 0;
	}
}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetics problems.
147 * A weight of a cfs_rq is the sum of weights of which entities
148 * are queued on this cfs_rq, so a weight of a entity should not be
149 * too large, so as the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: those places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), lock
358 * acquire operations must be ordered by ascending &runqueue.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
/* Return the CPU number runqueue @rq belongs to (always 0 on !SMP builds,
 * where rq has no ->cpu field). */
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
 /*
  * Walk bottom-up and stop at the first level that lacks @flag; the
  * last level that had it is returned. NOTE(review): this assumes
  * @flag is set contiguously from the lowest domain upward. Returns
  * NULL when even the lowest domain lacks @flag.
  */
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this tasks belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
/*
 * Resolve @p's task_group from its cpu-cgroup subsystem state (see the
 * locking comment above), then give the autogroup code a chance to
 * substitute its own group — see autogroup_task_group() in auto_group.h.
 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
 /* RCU dereference, additionally valid under p->pi_lock or rq->lock */
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
/* Re-point @p's per-group runqueue links (set_task_rq()) and, on SMP,
 * its thread_info->cpu at @cpu. */
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
/* Global RT period: sysctl_sched_rt_period (microseconds) in nanoseconds. */
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
/*
 * Global RT runtime limit in nanoseconds. A negative sysctl value means
 * "no limit" and is mapped to RUNTIME_INF.
 */
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
672
673
/* Is @p the task installed as @rq's current task (rq->curr)? */
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
/*
 * Is @p executing on a CPU right now? SMP tracks this via p->on_cpu
 * (set/cleared around the context switch by prepare_lock_switch() /
 * finish_lock_switch() below); !SMP simply checks rq->curr.
 */
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/*
 * Called before the context switch, with rq->lock held: mark @next as
 * on-CPU so task_running() sees it as busy.
 */
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
/*
 * Called after the context switch completes: clear prev->on_cpu (after
 * a write barrier), fix up rq->lock ownership for spinlock debugging and
 * lockdep, then release the lock and re-enable interrupts.
 */
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
/* Add @inc to @lw->weight; zeroing inv_weight marks the cached inverse
 * (cf. the 2^32/weight values in prio_to_wmult[]) as stale. */
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
/* Subtract @dec from @lw->weight and invalidate the cached inverse. */
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
/* Set @lw->weight to @w and invalidate the cached inverse. */
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
790 * of tasks with abnormal "nice" values across CPUs the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
/* Parent accounting group of @ca, or NULL if @ca is NULL or is the root
 * group (its cgroup has no parent). */
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
/*
 * Fair (CONFIG_PREEMPT) variant, see comment above: unconditionally drop
 * this_rq->lock, then take both locks via double_rq_lock().
 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
 /* this_rq->lock was always dropped and reacquired: report that */
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
/*
 * Unfair variant, see comment above: first try to take busiest->lock
 * without dropping this_rq->lock. Only when the trylock fails and
 * @busiest orders first (lower address) do we drop this_rq->lock and
 * retake both in address order.
 *
 * Returns 1 iff this_rq->lock was released (and reacquired) here,
 * 0 if it was held throughout.
 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
/*
 * Must be called with interrupts disabled (BUG otherwise — the lock is
 * dropped first so the BUG report can make it out). Returns 1 if
 * this_rq->lock was dropped and reacquired while taking busiest->lock,
 * 0 if it stayed held the whole time.
 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work good under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
/* Undo double_lock_balance(): release @busiest and reset this_rq->lock's
 * lockdep subclass back to 0 (it may have been taken nested above). */
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
 /* always take the lower-addressed lock first to avoid ABBA deadlock */
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
 /* only one real lock was taken; balance the fake __acquire() */
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
 /* on !SMP there is only one runqueue, so both arguments must be it */
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
 /* balance the fake __acquire(rq2->lock) from double_rq_lock() */
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
/*
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 15

/*
 * seq_file show callback for /proc/schedstat: prints a version/timestamp
 * header, then one "cpu%d ..." line of runqueue counters per online CPU
 * and, on SMP, one "domain%d ..." line per sched domain containing the
 * per-idle-type load-balance statistics.
 */
static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
	/* room for one "%08x," chunk per 32 bits of a NR_CPUS-wide mask */
	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);

	if (mask_str == NULL)
		return -ENOMEM;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int dcount = 0;
#endif

		/* runqueue-specific stats */
		seq_printf(seq,
		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
		    cpu, rq->yld_count,
		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
		    rq->ttwu_count, rq->ttwu_local,
		    rq->rq_cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/* domain-specific stats; domain tree is RCU-protected */
		rcu_read_lock();
		for_each_domain(cpu, sd) {
			enum cpu_idle_type itype;

			cpumask_scnprintf(mask_str, mask_len,
					  sched_domain_span(sd));
			seq_printf(seq, "domain%d %s", dcount++, mask_str);
			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
					itype++) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
				    sd->lb_count[itype],
				    sd->lb_balanced[itype],
				    sd->lb_failed[itype],
				    sd->lb_imbalance[itype],
				    sd->lb_gained[itype],
				    sd->lb_hot_gained[itype],
				    sd->lb_nobusyq[itype],
				    sd->lb_nobusyg[itype]);
			}
			seq_printf(seq,
			    " %u %u %u %u %u %u %u %u %u %u %u %u\n",
			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
			    sd->ttwu_move_balance);
		}
		rcu_read_unlock();
#endif
	}
	kfree(mask_str);
	return 0;
}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
/* File operations backing /proc/schedstat; reads go through seq_file. */
static const struct file_operations proc_schedstat_operations = {
	.open = schedstat_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* Register /proc/schedstat at boot; mode 0 lets procfs pick its default. */
static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	return 0;
}
module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/signal.c b/kernel/signal.c
index 206551563cce..56ce3a618b28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1629 info.si_uid = __task_cred(tsk)->uid; 1629 info.si_uid = __task_cred(tsk)->uid;
1630 rcu_read_unlock(); 1630 rcu_read_unlock();
1631 1631
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1633 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1634
1637 info.si_status = tsk->exit_code & 0x7f; 1635 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1636 if (tsk->exit_code & 0x80)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..ddf8155bf3f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
/**
 * tick_nohz_idle_enter - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 * Called when we start the idle loop.
 *
 * The arch is responsible for calling:
 *
 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 *  to sleep.
 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	WARN_ON_ONCE(irqs_disabled());

	/*
	 * Update the idle state in the scheduler domain hierarchy
	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
	 * State will be updated to busy during the first busy tick after
	 * exiting idle.
	 */
	set_cpu_sd_state_idle();

	local_irq_disable();

	ts = &__get_cpu_var(tick_cpu_sched);
	/*
	 * Set ts->inidle unconditionally. Even if the system did not
	 * switch to NOHZ mode the cpu frequency governors rely on the
	 * update of the idle time accounting in tick_nohz_start_idle().
	 */
	ts->inidle = 1;
	tick_nohz_stop_sched_tick(ts);

	local_irq_enable();
}
490
/**
 * tick_nohz_irq_exit - update next tick event from interrupt exit
 *
 * When an interrupt fires while we are idle and it doesn't cause
 * a reschedule, it may still add, modify or delete a timer, enqueue
 * an RCU callback, etc...
 * So we need to re-calculate and reprogram the next tick event.
 */
void tick_nohz_irq_exit(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	/* Nothing to do unless the interrupted context was the idle loop. */
	if (!ts->inidle)
		return;

	tick_nohz_stop_sched_tick(ts);
}
479/** 509/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 545}
516 546
517/** 547/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 549 *
520 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
 551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
521 */ 553 */
522void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
523{ 555{
524 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 561 ktime_t now;
530 562
531 local_irq_disable(); 563 local_irq_disable();
564
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 566 now = ktime_get();
534 567
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
543 576
544 ts->inidle = 0; 577 ts->inidle = 0;
545 578
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 579 /* Update jiffies first */
549 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 237841378c03..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta convert to nanoseconds using ntp adjusted mult. */ 134 /* return delta convert to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset)
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 * 814 *
815 * Note we subtract one in the shift, so that error is really error*2. 815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) intererval twice, but keeps the 816 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparision as still measuring if error is 817 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 818 * larger then half an interval.
819 * 819 *
820 * Note: It does not "save" on aggrivation when reading the code. 820 * Note: It does not "save" on aggravation when reading the code.
821 */ 821 */
822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
823 if (error > interval) { 823 if (error > interval) {
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset)
833 * nanosecond, and store the amount rounded up into 833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely. 834 * the error. This causes the likely below to be unlikely.
835 * 835 *
836 * The properfix is to avoid rounding up by using 836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of 837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some 838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time. 839 * time.
diff --git a/kernel/timer.c b/kernel/timer.c
index 9c3c62b0c4bc..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
/* Stub timer callback for improperly used timers. */
static void stub_timer(unsigned long data)
{
	/* Only reachable when debugobjects had to defuse a bogus timer. */
	WARN_ON(1);
}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
/* Forward the "must be initialized" assertion to the debugobjects core. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
	debug_object_assert_init(timer, &timer_debug_descr);
}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
/* Wrapper so callers compile to a no-op when CONFIG_DEBUG_OBJECTS_TIMERS=n. */
static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..91dc4bc8bf72 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_RAW_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1843 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1844} 1846}
1845 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1846static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1847{ 1876{
1848 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1855 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1856} 1885}
1857 1886
1858static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1859{ 1899{
1860 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1861 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1862} 1903}
1863 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1864 1916
1865void 1917void
1866print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1869 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1870 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1871 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1872 unsigned long entries = 0; 1924 unsigned long entries;
1873 unsigned long total = 0; 1925 unsigned long total;
1874 unsigned long count;
1875 const char *name = "preemption"; 1926 const char *name = "preemption";
1876 int cpu;
1877 1927
1878 if (type) 1928 if (type)
1879 name = type->name; 1929 name = type->name;
1880 1930
1881 1931 get_total_entries(tr, &total, &entries);
1882 for_each_tracing_cpu(cpu) {
1883 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1884 /*
1885 * If this buffer has skipped entries, then we hold all
1886 * entries for the trace and we need to ignore the
1887 * ones before the time stamp.
1888 */
1889 if (tr->data[cpu]->skipped_entries) {
1890 count -= tr->data[cpu]->skipped_entries;
1891 /* total is the same as the entries */
1892 total += count;
1893 } else
1894 total += count +
1895 ring_buffer_overrun_cpu(tr->buffer, cpu);
1896 entries += count;
1897 }
1898 1932
1899 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1900 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2140 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2141} 2175}
2142 2176
/* Write the latency-format trace header for seq_file @m, if non-empty. */
void trace_latency_header(struct seq_file *m)
{
	struct trace_iterator *iter = m->private;

	/* print nothing if the buffers are empty */
	if (trace_empty(iter))
		return;

	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
		print_trace_header(m, iter);

	if (!(trace_flags & TRACE_ITER_VERBOSE))
		print_lat_help_header(m);
}
2191
2143void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2144{ 2193{
2145 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
2155 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2156 print_lat_help_header(m); 2205 print_lat_help_header(m);
2157 } else { 2206 } else {
2158 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2159 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2160 } 2213 }
2161} 2214}
2162 2215
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4828{
4776 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4777} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4832
4779__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4780{ 4834{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..2c2657462ac3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
654 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
655 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
656 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
657}; 659};
658 660
659/* 661/*
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 95dc31efd6dd..f04cc3136bd3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
646 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 654 else
649 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
650 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
651} 657}
652 658
@@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1838 if (!filter) 1844 if (!filter)
1839 goto out; 1845 goto out;
1840 1846
1841 replace_filter_string(filter, filter_string); 1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1842 /* 1851 /*
1843 * No event actually uses the system filter 1852 * No event actually uses the system filter
1844 * we can free it without synchronize_sched(). 1853 * we can free it without synchronize_sched().
@@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1848 1857
1849 parse_init(ps, filter_ops, filter_string); 1858 parse_init(ps, filter_ops, filter_string);
1850 err = filter_parse(ps); 1859 err = filter_parse(ps);
1851 if (err) { 1860 if (err)
1852 append_filter_err(ps, system->filter); 1861 goto err_filter;
1853 goto out;
1854 }
1855 1862
1856 err = replace_system_preds(system, ps, filter_string); 1863 err = replace_system_preds(system, ps, filter_string);
1857 if (err) 1864 if (err)
1858 append_filter_err(ps, system->filter); 1865 goto err_filter;
1859 1866
1860out: 1867out:
1861 filter_opstack_clear(ps); 1868 filter_opstack_clear(ps);
@@ -1865,6 +1872,11 @@ out_unlock:
1865 mutex_unlock(&event_mutex); 1872 mutex_unlock(&event_mutex);
1866 1873
1867 return err; 1874 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1868} 1880}
1869 1881
1870#ifdef CONFIG_PERF_EVENTS 1882#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a163..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90f..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19