Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 20
-rw-r--r--  kernel/acct.c | 4
-rw-r--r--  kernel/cpu.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/callchain.c | 191
-rw-r--r--  kernel/events/core.c | 298
-rw-r--r--  kernel/events/internal.h | 39
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 14
-rw-r--r--  kernel/irq/irqdomain.c | 12
-rw-r--r--  kernel/itimer.c | 15
-rw-r--r--  kernel/jump_label.c | 49
-rw-r--r--  kernel/lockdep.c | 83
-rw-r--r--  kernel/panic.c | 17
-rw-r--r--  kernel/posix-cpu-timers.c | 132
-rw-r--r--  kernel/printk.c | 11
-rw-r--r--  kernel/rcu.h | 7
-rw-r--r--  kernel/rcupdate.c | 12
-rw-r--r--  kernel/rcutiny.c | 149
-rw-r--r--  kernel/rcutiny_plugin.h | 29
-rw-r--r--  kernel/rcutorture.c | 225
-rw-r--r--  kernel/rcutree.c | 290
-rw-r--r--  kernel/rcutree.h | 26
-rw-r--r--  kernel/rcutree_plugin.h | 289
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex.c | 8
-rw-r--r--  kernel/sched/Makefile | 20
-rw-r--r--  kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c) | 33
-rw-r--r--  kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h) | 26
-rw-r--r--  kernel/sched/clock.c (renamed from kernel/sched_clock.c) | 0
-rw-r--r--  kernel/sched/core.c (renamed from kernel/sched.c) | 2187
-rw-r--r--  kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c) | 4
-rw-r--r--  kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h) | 0
-rw-r--r--  kernel/sched/debug.c (renamed from kernel/sched_debug.c) | 6
-rw-r--r--  kernel/sched/fair.c (renamed from kernel/sched_fair.c) | 1000
-rw-r--r--  kernel/sched/features.h (renamed from kernel/sched_features.h) | 30
-rw-r--r--  kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c) | 4
-rw-r--r--  kernel/sched/rt.c (renamed from kernel/sched_rt.c) | 218
-rw-r--r--  kernel/sched/sched.h | 1166
-rw-r--r--  kernel/sched/stats.c | 111
-rw-r--r--  kernel/sched/stats.h (renamed from kernel/sched_stats.h) | 109
-rw-r--r--  kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c) | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 105
-rw-r--r--  kernel/time/timekeeping.c | 10
-rw-r--r--  kernel/timer.c | 62
-rw-r--r--  kernel/trace/trace.c | 106
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_events_filter.c | 26
-rw-r--r--  kernel/trace/trace_irqsoff.c | 13
-rw-r--r--  kernel/trace/trace_output.c | 16
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 13
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/wait.c | 4
58 files changed, 4206 insertions, 3050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@ | |||
2 | # Makefile for the linux kernel. | 2 | # Makefile for the linux kernel. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o groups.o |
14 | obj-y += groups.o | ||
15 | 14 | ||
16 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
17 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg | |||
20 | CFLAGS_REMOVE_mutex-debug.o = -pg | 19 | CFLAGS_REMOVE_mutex-debug.o = -pg |
21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 20 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
22 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 21 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
23 | CFLAGS_REMOVE_sched_clock.o = -pg | ||
24 | CFLAGS_REMOVE_irq_work.o = -pg | 22 | CFLAGS_REMOVE_irq_work.o = -pg |
25 | endif | 23 | endif |
26 | 24 | ||
25 | obj-y += sched/ | ||
26 | |||
27 | obj-$(CONFIG_FREEZER) += freezer.o | 27 | obj-$(CONFIG_FREEZER) += freezer.o |
28 | obj-$(CONFIG_PROFILING) += profile.o | 28 | obj-$(CONFIG_PROFILING) += profile.o |
29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o | 29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o |
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ | |||
99 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 100 | obj-$(CONFIG_RING_BUFFER) += trace/ |
101 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 101 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
102 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 102 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
104 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 103 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
105 | 104 | ||
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o | |||
110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
111 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 110 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
112 | 111 | ||
113 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
114 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
115 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
116 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
117 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
118 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
119 | CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer | ||
120 | endif | ||
121 | |||
122 | $(obj)/configs.o: $(obj)/config_data.h | 112 | $(obj)/configs.o: $(obj)/config_data.h |
123 | 113 | ||
124 | # config_data.h contains the same information as ikconfig.h but gzipped. | 114 | # config_data.h contains the same information as ikconfig.h but gzipped. |
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..203dfead2e06 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead) | |||
613 | pacct->ac_flag |= ACORE; | 613 | pacct->ac_flag |= ACORE; |
614 | if (current->flags & PF_SIGNALED) | 614 | if (current->flags & PF_SIGNALED) |
615 | pacct->ac_flag |= AXSIG; | 615 | pacct->ac_flag |= AXSIG; |
616 | pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); | 616 | pacct->ac_utime += current->utime; |
617 | pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); | 617 | pacct->ac_stime += current->stime; |
618 | pacct->ac_minflt += current->min_flt; | 618 | pacct->ac_minflt += current->min_flt; |
619 | pacct->ac_majflt += current->maj_flt; | 619 | pacct->ac_majflt += current->maj_flt; |
620 | spin_unlock_irq(¤t->sighand->siglock); | 620 | spin_unlock_irq(¤t->sighand->siglock); |
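The cputime_add()/cputime_eq() conversions in this and the following files rely on cputime_t now behaving as a plain arithmetic scalar. A minimal sketch of the before/after pattern, using simplified stand-in definitions rather than the real <asm/cputime.h> ones:

/* Illustrative only: the typedef and macros are stand-ins. */
typedef unsigned long long cputime_t;

#define cputime_zero		((cputime_t)0)
#define cputime_add(a, b)	((a) + (b))	/* helper removed by this series */

/* before: accumulate via the accessor macros */
static void account_old(cputime_t *acc, cputime_t delta)
{
	*acc = cputime_add(*acc, delta);
}

/* after: cputime_t is an ordinary scalar, so plain operators work */
static void account_new(cputime_t *acc, cputime_t delta)
{
	*acc += delta;
}
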
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..5ca38d5d238a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu) | |||
178 | write_lock_irq(&tasklist_lock); | 178 | write_lock_irq(&tasklist_lock); |
179 | for_each_process(p) { | 179 | for_each_process(p) { |
180 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 180 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
181 | (!cputime_eq(p->utime, cputime_zero) || | 181 | (p->utime || p->stime)) |
182 | !cputime_eq(p->stime, cputime_zero))) | ||
183 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 182 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " |
184 | "(state = %ld, flags = %x)\n", | 183 | "(state = %ld, flags = %x)\n", |
185 | p->comm, task_pid_nr(p), cpu, | 184 | p->comm, task_pid_nr(p), cpu, |
@@ -380,6 +379,7 @@ out: | |||
380 | cpu_maps_update_done(); | 379 | cpu_maps_update_done(); |
381 | return err; | 380 | return err; |
382 | } | 381 | } |
382 | EXPORT_SYMBOL_GPL(cpu_up); | ||
383 | 383 | ||
384 | #ifdef CONFIG_PM_SLEEP_SMP | 384 | #ifdef CONFIG_PM_SLEEP_SMP |
385 | static cpumask_var_t frozen_cpus; | 385 | static cpumask_var_t frozen_cpus; |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) | |||
636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : | 636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : |
637 | (p->exit_state & EXIT_DEAD) ? 'E' : | 637 | (p->exit_state & EXIT_DEAD) ? 'E' : |
638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; | 638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; |
639 | if (p->pid == 0) { | 639 | if (is_idle_task(p)) { |
640 | /* Idle task. Is it really idle, apart from the kdb | 640 | /* Idle task. Is it really idle, apart from the kdb |
641 | * interrupt? */ | 641 | * interrupt? */ |
642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { | 642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { |
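is_idle_task() is the helper this series introduces in <linux/sched.h>; at this point it is essentially the old PID-zero test behind a readable name. Roughly (a sketch, not the exact kernel text):

/* Illustrative only; see <linux/sched.h> in this series for the real helper. */
static inline bool is_idle_task(const struct task_struct *p)
{
	return p->pid == 0;	/* the same test the open-coded check used */
}
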
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER | |||
2 | CFLAGS_REMOVE_core.o = -pg | 2 | CFLAGS_REMOVE_core.o = -pg |
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@ | |||
1 | /* | ||
2 | * Performance events callchain code, extracted from core.c: | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
8 | * | ||
9 | * For licensing details see kernel-base/COPYING | ||
10 | */ | ||
11 | |||
12 | #include <linux/perf_event.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include "internal.h" | ||
15 | |||
16 | struct callchain_cpus_entries { | ||
17 | struct rcu_head rcu_head; | ||
18 | struct perf_callchain_entry *cpu_entries[0]; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
22 | static atomic_t nr_callchain_events; | ||
23 | static DEFINE_MUTEX(callchain_mutex); | ||
24 | static struct callchain_cpus_entries *callchain_cpus_entries; | ||
25 | |||
26 | |||
27 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
28 | struct pt_regs *regs) | ||
29 | { | ||
30 | } | ||
31 | |||
32 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
33 | struct pt_regs *regs) | ||
34 | { | ||
35 | } | ||
36 | |||
37 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
38 | { | ||
39 | struct callchain_cpus_entries *entries; | ||
40 | int cpu; | ||
41 | |||
42 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
43 | |||
44 | for_each_possible_cpu(cpu) | ||
45 | kfree(entries->cpu_entries[cpu]); | ||
46 | |||
47 | kfree(entries); | ||
48 | } | ||
49 | |||
50 | static void release_callchain_buffers(void) | ||
51 | { | ||
52 | struct callchain_cpus_entries *entries; | ||
53 | |||
54 | entries = callchain_cpus_entries; | ||
55 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
57 | } | ||
58 | |||
59 | static int alloc_callchain_buffers(void) | ||
60 | { | ||
61 | int cpu; | ||
62 | int size; | ||
63 | struct callchain_cpus_entries *entries; | ||
64 | |||
65 | /* | ||
66 | * We can't use the percpu allocation API for data that can be | ||
67 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
68 | * until that gets sorted out. | ||
69 | */ | ||
70 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
71 | |||
72 | entries = kzalloc(size, GFP_KERNEL); | ||
73 | if (!entries) | ||
74 | return -ENOMEM; | ||
75 | |||
76 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
77 | |||
78 | for_each_possible_cpu(cpu) { | ||
79 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
80 | cpu_to_node(cpu)); | ||
81 | if (!entries->cpu_entries[cpu]) | ||
82 | goto fail; | ||
83 | } | ||
84 | |||
85 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
86 | |||
87 | return 0; | ||
88 | |||
89 | fail: | ||
90 | for_each_possible_cpu(cpu) | ||
91 | kfree(entries->cpu_entries[cpu]); | ||
92 | kfree(entries); | ||
93 | |||
94 | return -ENOMEM; | ||
95 | } | ||
96 | |||
97 | int get_callchain_buffers(void) | ||
98 | { | ||
99 | int err = 0; | ||
100 | int count; | ||
101 | |||
102 | mutex_lock(&callchain_mutex); | ||
103 | |||
104 | count = atomic_inc_return(&nr_callchain_events); | ||
105 | if (WARN_ON_ONCE(count < 1)) { | ||
106 | err = -EINVAL; | ||
107 | goto exit; | ||
108 | } | ||
109 | |||
110 | if (count > 1) { | ||
111 | /* If the allocation failed, give up */ | ||
112 | if (!callchain_cpus_entries) | ||
113 | err = -ENOMEM; | ||
114 | goto exit; | ||
115 | } | ||
116 | |||
117 | err = alloc_callchain_buffers(); | ||
118 | if (err) | ||
119 | release_callchain_buffers(); | ||
120 | exit: | ||
121 | mutex_unlock(&callchain_mutex); | ||
122 | |||
123 | return err; | ||
124 | } | ||
125 | |||
126 | void put_callchain_buffers(void) | ||
127 | { | ||
128 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
129 | release_callchain_buffers(); | ||
130 | mutex_unlock(&callchain_mutex); | ||
131 | } | ||
132 | } | ||
133 | |||
134 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
135 | { | ||
136 | int cpu; | ||
137 | struct callchain_cpus_entries *entries; | ||
138 | |||
139 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
140 | if (*rctx == -1) | ||
141 | return NULL; | ||
142 | |||
143 | entries = rcu_dereference(callchain_cpus_entries); | ||
144 | if (!entries) | ||
145 | return NULL; | ||
146 | |||
147 | cpu = smp_processor_id(); | ||
148 | |||
149 | return &entries->cpu_entries[cpu][*rctx]; | ||
150 | } | ||
151 | |||
152 | static void | ||
153 | put_callchain_entry(int rctx) | ||
154 | { | ||
155 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
156 | } | ||
157 | |||
158 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
159 | { | ||
160 | int rctx; | ||
161 | struct perf_callchain_entry *entry; | ||
162 | |||
163 | |||
164 | entry = get_callchain_entry(&rctx); | ||
165 | if (rctx == -1) | ||
166 | return NULL; | ||
167 | |||
168 | if (!entry) | ||
169 | goto exit_put; | ||
170 | |||
171 | entry->nr = 0; | ||
172 | |||
173 | if (!user_mode(regs)) { | ||
174 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
175 | perf_callchain_kernel(entry, regs); | ||
176 | if (current->mm) | ||
177 | regs = task_pt_regs(current); | ||
178 | else | ||
179 | regs = NULL; | ||
180 | } | ||
181 | |||
182 | if (regs) { | ||
183 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
184 | perf_callchain_user(entry, regs); | ||
185 | } | ||
186 | |||
187 | exit_put: | ||
188 | put_callchain_entry(rctx); | ||
189 | |||
190 | return entry; | ||
191 | } | ||
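For orientation, a hedged sketch of how the perf core consumes this API; the my_* wrappers are illustrative names, the real call sites live in kernel/events/core.c:

/* Kernel-context sketch; error handling trimmed. */
static int my_event_init(struct perf_event *event)
{
	/* one shared, refcounted set of per-cpu buffers for all users */
	return get_callchain_buffers();
}

static void my_event_destroy(struct perf_event *event)
{
	/* last user drops the buffers via RCU (release_callchain_buffers) */
	put_callchain_buffers();
}

static void my_record_sample(struct pt_regs *regs)
{
	struct perf_callchain_entry *callchain;

	/* NMI-safe: returns a per-cpu, per-context entry, or NULL on recursion */
	callchain = perf_callchain(regs);
	if (callchain) {
		/* consume callchain->nr entries from callchain->ip[] */
	}
}
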
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..890eb02c2f21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -128,7 +128,7 @@ enum event_type_t { | |||
128 | * perf_sched_events : >0 events exist | 128 | * perf_sched_events : >0 events exist |
129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
130 | */ | 130 | */ |
131 | struct jump_label_key perf_sched_events __read_mostly; | 131 | struct jump_label_key_deferred perf_sched_events __read_mostly; |
132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
133 | 133 | ||
134 | static atomic_t nr_mmap_events __read_mostly; | 134 | static atomic_t nr_mmap_events __read_mostly; |
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event, | |||
1130 | if (!is_software_event(event)) | 1130 | if (!is_software_event(event)) |
1131 | cpuctx->active_oncpu--; | 1131 | cpuctx->active_oncpu--; |
1132 | ctx->nr_active--; | 1132 | ctx->nr_active--; |
1133 | if (event->attr.freq && event->attr.sample_freq) | ||
1134 | ctx->nr_freq--; | ||
1133 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1135 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
1134 | cpuctx->exclusive = 0; | 1136 | cpuctx->exclusive = 0; |
1135 | } | 1137 | } |
@@ -1325,6 +1327,7 @@ retry: | |||
1325 | } | 1327 | } |
1326 | raw_spin_unlock_irq(&ctx->lock); | 1328 | raw_spin_unlock_irq(&ctx->lock); |
1327 | } | 1329 | } |
1330 | EXPORT_SYMBOL_GPL(perf_event_disable); | ||
1328 | 1331 | ||
1329 | static void perf_set_shadow_time(struct perf_event *event, | 1332 | static void perf_set_shadow_time(struct perf_event *event, |
1330 | struct perf_event_context *ctx, | 1333 | struct perf_event_context *ctx, |
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event, | |||
1406 | if (!is_software_event(event)) | 1409 | if (!is_software_event(event)) |
1407 | cpuctx->active_oncpu++; | 1410 | cpuctx->active_oncpu++; |
1408 | ctx->nr_active++; | 1411 | ctx->nr_active++; |
1412 | if (event->attr.freq && event->attr.sample_freq) | ||
1413 | ctx->nr_freq++; | ||
1409 | 1414 | ||
1410 | if (event->attr.exclusive) | 1415 | if (event->attr.exclusive) |
1411 | cpuctx->exclusive = 1; | 1416 | cpuctx->exclusive = 1; |
@@ -1662,8 +1667,7 @@ retry: | |||
1662 | * Note: this works for group members as well as group leaders | 1667 | * Note: this works for group members as well as group leaders |
1663 | * since the non-leader members' sibling_lists will be empty. | 1668 | * since the non-leader members' sibling_lists will be empty. |
1664 | */ | 1669 | */ |
1665 | static void __perf_event_mark_enabled(struct perf_event *event, | 1670 | static void __perf_event_mark_enabled(struct perf_event *event) |
1666 | struct perf_event_context *ctx) | ||
1667 | { | 1671 | { |
1668 | struct perf_event *sub; | 1672 | struct perf_event *sub; |
1669 | u64 tstamp = perf_event_time(event); | 1673 | u64 tstamp = perf_event_time(event); |
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info) | |||
1701 | */ | 1705 | */ |
1702 | perf_cgroup_set_timestamp(current, ctx); | 1706 | perf_cgroup_set_timestamp(current, ctx); |
1703 | 1707 | ||
1704 | __perf_event_mark_enabled(event, ctx); | 1708 | __perf_event_mark_enabled(event); |
1705 | 1709 | ||
1706 | if (!event_filter_match(event)) { | 1710 | if (!event_filter_match(event)) { |
1707 | if (is_cgroup_event(event)) | 1711 | if (is_cgroup_event(event)) |
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) | |||
1782 | 1786 | ||
1783 | retry: | 1787 | retry: |
1784 | if (!ctx->is_active) { | 1788 | if (!ctx->is_active) { |
1785 | __perf_event_mark_enabled(event, ctx); | 1789 | __perf_event_mark_enabled(event); |
1786 | goto out; | 1790 | goto out; |
1787 | } | 1791 | } |
1788 | 1792 | ||
@@ -1809,6 +1813,7 @@ retry: | |||
1809 | out: | 1813 | out: |
1810 | raw_spin_unlock_irq(&ctx->lock); | 1814 | raw_spin_unlock_irq(&ctx->lock); |
1811 | } | 1815 | } |
1816 | EXPORT_SYMBOL_GPL(perf_event_enable); | ||
1812 | 1817 | ||
1813 | int perf_event_refresh(struct perf_event *event, int refresh) | 1818 | int perf_event_refresh(struct perf_event *event, int refresh) |
1814 | { | 1819 | { |
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2327 | u64 interrupts, now; | 2332 | u64 interrupts, now; |
2328 | s64 delta; | 2333 | s64 delta; |
2329 | 2334 | ||
2335 | if (!ctx->nr_freq) | ||
2336 | return; | ||
2337 | |||
2330 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 2338 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
2331 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2339 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2332 | continue; | 2340 | continue; |
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2382 | { | 2390 | { |
2383 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; | 2391 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
2384 | struct perf_event_context *ctx = NULL; | 2392 | struct perf_event_context *ctx = NULL; |
2385 | int rotate = 0, remove = 1; | 2393 | int rotate = 0, remove = 1, freq = 0; |
2386 | 2394 | ||
2387 | if (cpuctx->ctx.nr_events) { | 2395 | if (cpuctx->ctx.nr_events) { |
2388 | remove = 0; | 2396 | remove = 0; |
2389 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 2397 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
2390 | rotate = 1; | 2398 | rotate = 1; |
2399 | if (cpuctx->ctx.nr_freq) | ||
2400 | freq = 1; | ||
2391 | } | 2401 | } |
2392 | 2402 | ||
2393 | ctx = cpuctx->task_ctx; | 2403 | ctx = cpuctx->task_ctx; |
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2395 | remove = 0; | 2405 | remove = 0; |
2396 | if (ctx->nr_events != ctx->nr_active) | 2406 | if (ctx->nr_events != ctx->nr_active) |
2397 | rotate = 1; | 2407 | rotate = 1; |
2408 | if (ctx->nr_freq) | ||
2409 | freq = 1; | ||
2398 | } | 2410 | } |
2399 | 2411 | ||
2412 | if (!rotate && !freq) | ||
2413 | goto done; | ||
2414 | |||
2400 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 2415 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
2401 | perf_pmu_disable(cpuctx->ctx.pmu); | 2416 | perf_pmu_disable(cpuctx->ctx.pmu); |
2402 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
2403 | if (ctx) | ||
2404 | perf_ctx_adjust_freq(ctx, interval); | ||
2405 | 2417 | ||
2406 | if (!rotate) | 2418 | if (freq) { |
2407 | goto done; | 2419 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); |
2420 | if (ctx) | ||
2421 | perf_ctx_adjust_freq(ctx, interval); | ||
2422 | } | ||
2408 | 2423 | ||
2409 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2424 | if (rotate) { |
2410 | if (ctx) | 2425 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2411 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 2426 | if (ctx) |
2427 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | ||
2412 | 2428 | ||
2413 | rotate_ctx(&cpuctx->ctx); | 2429 | rotate_ctx(&cpuctx->ctx); |
2414 | if (ctx) | 2430 | if (ctx) |
2415 | rotate_ctx(ctx); | 2431 | rotate_ctx(ctx); |
2416 | 2432 | ||
2417 | perf_event_sched_in(cpuctx, ctx, current); | 2433 | perf_event_sched_in(cpuctx, ctx, current); |
2434 | } | ||
2435 | |||
2436 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2437 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2418 | 2438 | ||
2419 | done: | 2439 | done: |
2420 | if (remove) | 2440 | if (remove) |
2421 | list_del_init(&cpuctx->rotation_list); | 2441 | list_del_init(&cpuctx->rotation_list); |
2422 | |||
2423 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2424 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2425 | } | 2442 | } |
2426 | 2443 | ||
2427 | void perf_event_task_tick(void) | 2444 | void perf_event_task_tick(void) |
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event, | |||
2448 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 2465 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
2449 | return 0; | 2466 | return 0; |
2450 | 2467 | ||
2451 | __perf_event_mark_enabled(event, ctx); | 2468 | __perf_event_mark_enabled(event); |
2452 | 2469 | ||
2453 | return 1; | 2470 | return 1; |
2454 | } | 2471 | } |
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2480 | raw_spin_lock(&ctx->lock); | 2497 | raw_spin_lock(&ctx->lock); |
2481 | task_ctx_sched_out(ctx); | 2498 | task_ctx_sched_out(ctx); |
2482 | 2499 | ||
2483 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 2500 | list_for_each_entry(event, &ctx->event_list, event_entry) { |
2484 | ret = event_enable_on_exec(event, ctx); | ||
2485 | if (ret) | ||
2486 | enabled = 1; | ||
2487 | } | ||
2488 | |||
2489 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | ||
2490 | ret = event_enable_on_exec(event, ctx); | 2501 | ret = event_enable_on_exec(event, ctx); |
2491 | if (ret) | 2502 | if (ret) |
2492 | enabled = 1; | 2503 | enabled = 1; |
@@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event) | |||
2574 | } | 2585 | } |
2575 | 2586 | ||
2576 | /* | 2587 | /* |
2577 | * Callchain support | ||
2578 | */ | ||
2579 | |||
2580 | struct callchain_cpus_entries { | ||
2581 | struct rcu_head rcu_head; | ||
2582 | struct perf_callchain_entry *cpu_entries[0]; | ||
2583 | }; | ||
2584 | |||
2585 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
2586 | static atomic_t nr_callchain_events; | ||
2587 | static DEFINE_MUTEX(callchain_mutex); | ||
2588 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
2589 | |||
2590 | |||
2591 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
2592 | struct pt_regs *regs) | ||
2593 | { | ||
2594 | } | ||
2595 | |||
2596 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
2597 | struct pt_regs *regs) | ||
2598 | { | ||
2599 | } | ||
2600 | |||
2601 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
2602 | { | ||
2603 | struct callchain_cpus_entries *entries; | ||
2604 | int cpu; | ||
2605 | |||
2606 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
2607 | |||
2608 | for_each_possible_cpu(cpu) | ||
2609 | kfree(entries->cpu_entries[cpu]); | ||
2610 | |||
2611 | kfree(entries); | ||
2612 | } | ||
2613 | |||
2614 | static void release_callchain_buffers(void) | ||
2615 | { | ||
2616 | struct callchain_cpus_entries *entries; | ||
2617 | |||
2618 | entries = callchain_cpus_entries; | ||
2619 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
2620 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
2621 | } | ||
2622 | |||
2623 | static int alloc_callchain_buffers(void) | ||
2624 | { | ||
2625 | int cpu; | ||
2626 | int size; | ||
2627 | struct callchain_cpus_entries *entries; | ||
2628 | |||
2629 | /* | ||
2630 | * We can't use the percpu allocation API for data that can be | ||
2631 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
2632 | * until that gets sorted out. | ||
2633 | */ | ||
2634 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
2635 | |||
2636 | entries = kzalloc(size, GFP_KERNEL); | ||
2637 | if (!entries) | ||
2638 | return -ENOMEM; | ||
2639 | |||
2640 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
2641 | |||
2642 | for_each_possible_cpu(cpu) { | ||
2643 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
2644 | cpu_to_node(cpu)); | ||
2645 | if (!entries->cpu_entries[cpu]) | ||
2646 | goto fail; | ||
2647 | } | ||
2648 | |||
2649 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
2650 | |||
2651 | return 0; | ||
2652 | |||
2653 | fail: | ||
2654 | for_each_possible_cpu(cpu) | ||
2655 | kfree(entries->cpu_entries[cpu]); | ||
2656 | kfree(entries); | ||
2657 | |||
2658 | return -ENOMEM; | ||
2659 | } | ||
2660 | |||
2661 | static int get_callchain_buffers(void) | ||
2662 | { | ||
2663 | int err = 0; | ||
2664 | int count; | ||
2665 | |||
2666 | mutex_lock(&callchain_mutex); | ||
2667 | |||
2668 | count = atomic_inc_return(&nr_callchain_events); | ||
2669 | if (WARN_ON_ONCE(count < 1)) { | ||
2670 | err = -EINVAL; | ||
2671 | goto exit; | ||
2672 | } | ||
2673 | |||
2674 | if (count > 1) { | ||
2675 | /* If the allocation failed, give up */ | ||
2676 | if (!callchain_cpus_entries) | ||
2677 | err = -ENOMEM; | ||
2678 | goto exit; | ||
2679 | } | ||
2680 | |||
2681 | err = alloc_callchain_buffers(); | ||
2682 | if (err) | ||
2683 | release_callchain_buffers(); | ||
2684 | exit: | ||
2685 | mutex_unlock(&callchain_mutex); | ||
2686 | |||
2687 | return err; | ||
2688 | } | ||
2689 | |||
2690 | static void put_callchain_buffers(void) | ||
2691 | { | ||
2692 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
2693 | release_callchain_buffers(); | ||
2694 | mutex_unlock(&callchain_mutex); | ||
2695 | } | ||
2696 | } | ||
2697 | |||
2698 | static int get_recursion_context(int *recursion) | ||
2699 | { | ||
2700 | int rctx; | ||
2701 | |||
2702 | if (in_nmi()) | ||
2703 | rctx = 3; | ||
2704 | else if (in_irq()) | ||
2705 | rctx = 2; | ||
2706 | else if (in_softirq()) | ||
2707 | rctx = 1; | ||
2708 | else | ||
2709 | rctx = 0; | ||
2710 | |||
2711 | if (recursion[rctx]) | ||
2712 | return -1; | ||
2713 | |||
2714 | recursion[rctx]++; | ||
2715 | barrier(); | ||
2716 | |||
2717 | return rctx; | ||
2718 | } | ||
2719 | |||
2720 | static inline void put_recursion_context(int *recursion, int rctx) | ||
2721 | { | ||
2722 | barrier(); | ||
2723 | recursion[rctx]--; | ||
2724 | } | ||
2725 | |||
2726 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
2727 | { | ||
2728 | int cpu; | ||
2729 | struct callchain_cpus_entries *entries; | ||
2730 | |||
2731 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
2732 | if (*rctx == -1) | ||
2733 | return NULL; | ||
2734 | |||
2735 | entries = rcu_dereference(callchain_cpus_entries); | ||
2736 | if (!entries) | ||
2737 | return NULL; | ||
2738 | |||
2739 | cpu = smp_processor_id(); | ||
2740 | |||
2741 | return &entries->cpu_entries[cpu][*rctx]; | ||
2742 | } | ||
2743 | |||
2744 | static void | ||
2745 | put_callchain_entry(int rctx) | ||
2746 | { | ||
2747 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
2748 | } | ||
2749 | |||
2750 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2751 | { | ||
2752 | int rctx; | ||
2753 | struct perf_callchain_entry *entry; | ||
2754 | |||
2755 | |||
2756 | entry = get_callchain_entry(&rctx); | ||
2757 | if (rctx == -1) | ||
2758 | return NULL; | ||
2759 | |||
2760 | if (!entry) | ||
2761 | goto exit_put; | ||
2762 | |||
2763 | entry->nr = 0; | ||
2764 | |||
2765 | if (!user_mode(regs)) { | ||
2766 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2767 | perf_callchain_kernel(entry, regs); | ||
2768 | if (current->mm) | ||
2769 | regs = task_pt_regs(current); | ||
2770 | else | ||
2771 | regs = NULL; | ||
2772 | } | ||
2773 | |||
2774 | if (regs) { | ||
2775 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2776 | perf_callchain_user(entry, regs); | ||
2777 | } | ||
2778 | |||
2779 | exit_put: | ||
2780 | put_callchain_entry(rctx); | ||
2781 | |||
2782 | return entry; | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * Initialize the perf_event context in a task_struct: | 2588 | * Initialize the perf_event context in a task_struct: |
2787 | */ | 2589 | */ |
2788 | static void __perf_event_init_context(struct perf_event_context *ctx) | 2590 | static void __perf_event_init_context(struct perf_event_context *ctx) |
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event) | |||
2946 | 2748 | ||
2947 | if (!event->parent) { | 2749 | if (!event->parent) { |
2948 | if (event->attach_state & PERF_ATTACH_TASK) | 2750 | if (event->attach_state & PERF_ATTACH_TASK) |
2949 | jump_label_dec(&perf_sched_events); | 2751 | jump_label_dec_deferred(&perf_sched_events); |
2950 | if (event->attr.mmap || event->attr.mmap_data) | 2752 | if (event->attr.mmap || event->attr.mmap_data) |
2951 | atomic_dec(&nr_mmap_events); | 2753 | atomic_dec(&nr_mmap_events); |
2952 | if (event->attr.comm) | 2754 | if (event->attr.comm) |
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event) | |||
2957 | put_callchain_buffers(); | 2759 | put_callchain_buffers(); |
2958 | if (is_cgroup_event(event)) { | 2760 | if (is_cgroup_event(event)) { |
2959 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | 2761 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); |
2960 | jump_label_dec(&perf_sched_events); | 2762 | jump_label_dec_deferred(&perf_sched_events); |
2961 | } | 2763 | } |
2962 | } | 2764 | } |
2963 | 2765 | ||
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4820 | struct hw_perf_event *hwc = &event->hw; | 4622 | struct hw_perf_event *hwc = &event->hw; |
4821 | int throttle = 0; | 4623 | int throttle = 0; |
4822 | 4624 | ||
4823 | data->period = event->hw.last_period; | ||
4824 | if (!overflow) | 4625 | if (!overflow) |
4825 | overflow = perf_swevent_set_period(event); | 4626 | overflow = perf_swevent_set_period(event); |
4826 | 4627 | ||
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4854 | if (!is_sampling_event(event)) | 4655 | if (!is_sampling_event(event)) |
4855 | return; | 4656 | return; |
4856 | 4657 | ||
4658 | if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { | ||
4659 | data->period = nr; | ||
4660 | return perf_swevent_overflow(event, 1, data, regs); | ||
4661 | } else | ||
4662 | data->period = event->hw.last_period; | ||
4663 | |||
4857 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4664 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4858 | return perf_swevent_overflow(event, 1, data, regs); | 4665 | return perf_swevent_overflow(event, 1, data, regs); |
4859 | 4666 | ||
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5366 | regs = get_irq_regs(); | 5173 | regs = get_irq_regs(); |
5367 | 5174 | ||
5368 | if (regs && !perf_exclude_event(event, regs)) { | 5175 | if (regs && !perf_exclude_event(event, regs)) { |
5369 | if (!(event->attr.exclude_idle && current->pid == 0)) | 5176 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5370 | if (perf_event_overflow(event, &data, regs)) | 5177 | if (perf_event_overflow(event, &data, regs)) |
5371 | ret = HRTIMER_NORESTART; | 5178 | ret = HRTIMER_NORESTART; |
5372 | } | 5179 | } |
@@ -5981,7 +5788,7 @@ done: | |||
5981 | 5788 | ||
5982 | if (!event->parent) { | 5789 | if (!event->parent) { |
5983 | if (event->attach_state & PERF_ATTACH_TASK) | 5790 | if (event->attach_state & PERF_ATTACH_TASK) |
5984 | jump_label_inc(&perf_sched_events); | 5791 | jump_label_inc(&perf_sched_events.key); |
5985 | if (event->attr.mmap || event->attr.mmap_data) | 5792 | if (event->attr.mmap || event->attr.mmap_data) |
5986 | atomic_inc(&nr_mmap_events); | 5793 | atomic_inc(&nr_mmap_events); |
5987 | if (event->attr.comm) | 5794 | if (event->attr.comm) |
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6219 | * - that may need work on context switch | 6026 | * - that may need work on context switch |
6220 | */ | 6027 | */ |
6221 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | 6028 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); |
6222 | jump_label_inc(&perf_sched_events); | 6029 | jump_label_inc(&perf_sched_events.key); |
6223 | } | 6030 | } |
6224 | 6031 | ||
6225 | /* | 6032 | /* |
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void) | |||
7065 | 6872 | ||
7066 | ret = init_hw_breakpoint(); | 6873 | ret = init_hw_breakpoint(); |
7067 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 6874 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
6875 | |||
6876 | /* do not patch jump label more than once per second */ | ||
6877 | jump_label_rate_limit(&perf_sched_events, HZ); | ||
7068 | } | 6878 | } |
7069 | 6879 | ||
7070 | static int __init perf_event_sysfs_init(void) | 6880 | static int __init perf_event_sysfs_init(void) |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 64568a699375..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@ | |||
1 | #ifndef _KERNEL_EVENTS_INTERNAL_H | 1 | #ifndef _KERNEL_EVENTS_INTERNAL_H |
2 | #define _KERNEL_EVENTS_INTERNAL_H | 2 | #define _KERNEL_EVENTS_INTERNAL_H |
3 | 3 | ||
4 | #include <linux/hardirq.h> | ||
5 | |||
6 | /* Buffer handling */ | ||
7 | |||
4 | #define RING_BUFFER_WRITABLE 0x01 | 8 | #define RING_BUFFER_WRITABLE 0x01 |
5 | 9 | ||
6 | struct ring_buffer { | 10 | struct ring_buffer { |
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb) | |||
67 | } | 71 | } |
68 | #endif | 72 | #endif |
69 | 73 | ||
70 | static unsigned long perf_data_size(struct ring_buffer *rb) | 74 | static inline unsigned long perf_data_size(struct ring_buffer *rb) |
71 | { | 75 | { |
72 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 76 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
73 | } | 77 | } |
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle, | |||
96 | } while (len); | 100 | } while (len); |
97 | } | 101 | } |
98 | 102 | ||
103 | /* Callchain handling */ | ||
104 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | ||
105 | extern int get_callchain_buffers(void); | ||
106 | extern void put_callchain_buffers(void); | ||
107 | |||
108 | static inline int get_recursion_context(int *recursion) | ||
109 | { | ||
110 | int rctx; | ||
111 | |||
112 | if (in_nmi()) | ||
113 | rctx = 3; | ||
114 | else if (in_irq()) | ||
115 | rctx = 2; | ||
116 | else if (in_softirq()) | ||
117 | rctx = 1; | ||
118 | else | ||
119 | rctx = 0; | ||
120 | |||
121 | if (recursion[rctx]) | ||
122 | return -1; | ||
123 | |||
124 | recursion[rctx]++; | ||
125 | barrier(); | ||
126 | |||
127 | return rctx; | ||
128 | } | ||
129 | |||
130 | static inline void put_recursion_context(int *recursion, int rctx) | ||
131 | { | ||
132 | barrier(); | ||
133 | recursion[rctx]--; | ||
134 | } | ||
135 | |||
99 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | 136 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ |
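The recursion helpers moved into this header distinguish four nesting levels (task, softirq, hardirq, NMI) as slots 0-3. A sketch of the guard pattern callchain.c builds on; the per-cpu array and function names below are illustrative:

/* Kernel-context sketch; "my_recursion" is not a real kernel symbol. */
static DEFINE_PER_CPU(int, my_recursion[PERF_NR_CONTEXTS]);

static void my_nonreentrant_work(void)
{
	int rctx;

	rctx = get_recursion_context(__get_cpu_var(my_recursion));
	if (rctx == -1)
		return;		/* already active at this nesting level */

	/* ... work that must not re-enter from softirq/hardirq/NMI ... */

	put_recursion_context(__get_cpu_var(my_recursion), rctx);
}
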
diff --git a/kernel/exit.c b/kernel/exit.c
index e6e01b959a0e..d579a459309d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
121 | * We won't ever get here for the group leader, since it | 121 | * We won't ever get here for the group leader, since it |
122 | * will have been the last reference on the signal_struct. | 122 | * will have been the last reference on the signal_struct. |
123 | */ | 123 | */ |
124 | sig->utime = cputime_add(sig->utime, tsk->utime); | 124 | sig->utime += tsk->utime; |
125 | sig->stime = cputime_add(sig->stime, tsk->stime); | 125 | sig->stime += tsk->stime; |
126 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 126 | sig->gtime += tsk->gtime; |
127 | sig->min_flt += tsk->min_flt; | 127 | sig->min_flt += tsk->min_flt; |
128 | sig->maj_flt += tsk->maj_flt; | 128 | sig->maj_flt += tsk->maj_flt; |
129 | sig->nvcsw += tsk->nvcsw; | 129 | sig->nvcsw += tsk->nvcsw; |
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1255 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1255 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1256 | psig = p->real_parent->signal; | 1256 | psig = p->real_parent->signal; |
1257 | sig = p->signal; | 1257 | sig = p->signal; |
1258 | psig->cutime = | 1258 | psig->cutime += tgutime + sig->cutime; |
1259 | cputime_add(psig->cutime, | 1259 | psig->cstime += tgstime + sig->cstime; |
1260 | cputime_add(tgutime, | 1260 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; |
1261 | sig->cutime)); | ||
1262 | psig->cstime = | ||
1263 | cputime_add(psig->cstime, | ||
1264 | cputime_add(tgstime, | ||
1265 | sig->cstime)); | ||
1266 | psig->cgtime = | ||
1267 | cputime_add(psig->cgtime, | ||
1268 | cputime_add(p->gtime, | ||
1269 | cputime_add(sig->gtime, | ||
1270 | sig->cgtime))); | ||
1271 | psig->cmin_flt += | 1261 | psig->cmin_flt += |
1272 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1262 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1273 | psig->cmaj_flt += | 1263 | psig->cmaj_flt += |
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..b058c5820ecd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | |||
1023 | */ | 1023 | */ |
1024 | static void posix_cpu_timers_init(struct task_struct *tsk) | 1024 | static void posix_cpu_timers_init(struct task_struct *tsk) |
1025 | { | 1025 | { |
1026 | tsk->cputime_expires.prof_exp = cputime_zero; | 1026 | tsk->cputime_expires.prof_exp = 0; |
1027 | tsk->cputime_expires.virt_exp = cputime_zero; | 1027 | tsk->cputime_expires.virt_exp = 0; |
1028 | tsk->cputime_expires.sched_exp = 0; | 1028 | tsk->cputime_expires.sched_exp = 0; |
1029 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); | 1029 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
1030 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); | 1030 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1132 | 1132 | ||
1133 | init_sigpending(&p->pending); | 1133 | init_sigpending(&p->pending); |
1134 | 1134 | ||
1135 | p->utime = cputime_zero; | 1135 | p->utime = p->stime = p->gtime = 0; |
1136 | p->stime = cputime_zero; | 1136 | p->utimescaled = p->stimescaled = 0; |
1137 | p->gtime = cputime_zero; | ||
1138 | p->utimescaled = cputime_zero; | ||
1139 | p->stimescaled = cputime_zero; | ||
1140 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1137 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1141 | p->prev_utime = cputime_zero; | 1138 | p->prev_utime = p->prev_stime = 0; |
1142 | p->prev_stime = cputime_zero; | ||
1143 | #endif | 1139 | #endif |
1144 | #if defined(SPLIT_RSS_COUNTING) | 1140 | #if defined(SPLIT_RSS_COUNTING) |
1145 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1141 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..7ca523b249ef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, | |||
143 | return 0; | 143 | return 0; |
144 | } | 144 | } |
145 | 145 | ||
146 | struct irq_domain_ops irq_domain_simple_ops = { | ||
147 | .dt_translate = irq_domain_simple_dt_translate, | ||
148 | }; | ||
149 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
150 | |||
151 | /** | 146 | /** |
152 | * irq_domain_create_simple() - Set up a 'simple' translation range | 147 | * irq_domain_create_simple() - Set up a 'simple' translation range |
153 | */ | 148 | */ |
@@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, | |||
182 | } | 177 | } |
183 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | 178 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); |
184 | #endif /* CONFIG_OF_IRQ */ | 179 | #endif /* CONFIG_OF_IRQ */ |
180 | |||
181 | struct irq_domain_ops irq_domain_simple_ops = { | ||
182 | #ifdef CONFIG_OF_IRQ | ||
183 | .dt_translate = irq_domain_simple_dt_translate, | ||
184 | #endif /* CONFIG_OF_IRQ */ | ||
185 | }; | ||
186 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
52 | 52 | ||
53 | cval = it->expires; | 53 | cval = it->expires; |
54 | cinterval = it->incr; | 54 | cinterval = it->incr; |
55 | if (!cputime_eq(cval, cputime_zero)) { | 55 | if (cval) { |
56 | struct task_cputime cputime; | 56 | struct task_cputime cputime; |
57 | cputime_t t; | 57 | cputime_t t; |
58 | 58 | ||
59 | thread_group_cputimer(tsk, &cputime); | 59 | thread_group_cputimer(tsk, &cputime); |
60 | if (clock_id == CPUCLOCK_PROF) | 60 | if (clock_id == CPUCLOCK_PROF) |
61 | t = cputime_add(cputime.utime, cputime.stime); | 61 | t = cputime.utime + cputime.stime; |
62 | else | 62 | else |
63 | /* CPUCLOCK_VIRT */ | 63 | /* CPUCLOCK_VIRT */ |
64 | t = cputime.utime; | 64 | t = cputime.utime; |
65 | 65 | ||
66 | if (cputime_le(cval, t)) | 66 | if (cval < t) |
67 | /* about to fire */ | 67 | /* about to fire */ |
68 | cval = cputime_one_jiffy; | 68 | cval = cputime_one_jiffy; |
69 | else | 69 | else |
70 | cval = cputime_sub(cval, t); | 70 | cval = cval - t; |
71 | } | 71 | } |
72 | 72 | ||
73 | spin_unlock_irq(&tsk->sighand->siglock); | 73 | spin_unlock_irq(&tsk->sighand->siglock); |
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
161 | 161 | ||
162 | cval = it->expires; | 162 | cval = it->expires; |
163 | cinterval = it->incr; | 163 | cinterval = it->incr; |
164 | if (!cputime_eq(cval, cputime_zero) || | 164 | if (cval || nval) { |
165 | !cputime_eq(nval, cputime_zero)) { | 165 | if (nval > 0) |
166 | if (cputime_gt(nval, cputime_zero)) | 166 | nval += cputime_one_jiffy; |
167 | nval = cputime_add(nval, cputime_one_jiffy); | ||
168 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); | 167 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); |
169 | } | 168 | } |
170 | it->expires = nval; | 169 | it->expires = nval; |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..30c3c7708132 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key) | |||
72 | jump_label_unlock(); | 72 | jump_label_unlock(); |
73 | } | 73 | } |
74 | 74 | ||
75 | void jump_label_dec(struct jump_label_key *key) | 75 | static void __jump_label_dec(struct jump_label_key *key, |
76 | unsigned long rate_limit, struct delayed_work *work) | ||
76 | { | 77 | { |
77 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) | 78 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
78 | return; | 79 | return; |
79 | 80 | ||
80 | jump_label_update(key, JUMP_LABEL_DISABLE); | 81 | if (rate_limit) { |
82 | atomic_inc(&key->enabled); | ||
83 | schedule_delayed_work(work, rate_limit); | ||
84 | } else | ||
85 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
86 | |||
81 | jump_label_unlock(); | 87 | jump_label_unlock(); |
82 | } | 88 | } |
83 | 89 | ||
90 | static void jump_label_update_timeout(struct work_struct *work) | ||
91 | { | ||
92 | struct jump_label_key_deferred *key = | ||
93 | container_of(work, struct jump_label_key_deferred, work.work); | ||
94 | __jump_label_dec(&key->key, 0, NULL); | ||
95 | } | ||
96 | |||
97 | void jump_label_dec(struct jump_label_key *key) | ||
98 | { | ||
99 | __jump_label_dec(key, 0, NULL); | ||
100 | } | ||
101 | |||
102 | void jump_label_dec_deferred(struct jump_label_key_deferred *key) | ||
103 | { | ||
104 | __jump_label_dec(&key->key, key->timeout, &key->work); | ||
105 | } | ||
106 | |||
107 | |||
108 | void jump_label_rate_limit(struct jump_label_key_deferred *key, | ||
109 | unsigned long rl) | ||
110 | { | ||
111 | key->timeout = rl; | ||
112 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | ||
113 | } | ||
114 | |||
84 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 115 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
85 | { | 116 | { |
86 | if (entry->code <= (unsigned long)end && | 117 | if (entry->code <= (unsigned long)end && |
@@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, | |||
111 | * running code can override this to make the non-live update case | 142 | * running code can override this to make the non-live update case |
112 | * cheaper. | 143 | * cheaper. |
113 | */ | 144 | */ |
114 | void __weak arch_jump_label_transform_static(struct jump_entry *entry, | 145 | void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, |
115 | enum jump_label_type type) | 146 | enum jump_label_type type) |
116 | { | 147 | { |
117 | arch_jump_label_transform(entry, type); | 148 | arch_jump_label_transform(entry, type); |
@@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod) | |||
217 | if (iter_start == iter_stop) | 248 | if (iter_start == iter_stop) |
218 | return; | 249 | return; |
219 | 250 | ||
220 | for (iter = iter_start; iter < iter_stop; iter++) | 251 | for (iter = iter_start; iter < iter_stop; iter++) { |
221 | arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); | 252 | struct jump_label_key *iterk; |
253 | |||
254 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | ||
255 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | ||
256 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
257 | } | ||
222 | } | 258 | } |
223 | 259 | ||
224 | static int jump_label_add_module(struct module *mod) | 260 | static int jump_label_add_module(struct module *mod) |
@@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod) | |||
258 | key->next = jlm; | 294 | key->next = jlm; |
259 | 295 | ||
260 | if (jump_label_enabled(key)) | 296 | if (jump_label_enabled(key)) |
261 | __jump_label_update(key, iter, iter_stop, | 297 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); |
262 | JUMP_LABEL_ENABLE); | ||
263 | } | 298 | } |
264 | 299 | ||
265 | return 0; | 300 | return 0; |
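Taken together with the perf hunk earlier (jump_label_rate_limit(&perf_sched_events, HZ)), the intended usage of the new deferred variant looks roughly like the sketch below; the struct and the three functions are the ones added by this series, while the my_* caller is illustrative:

/* Kernel-context sketch of the deferred jump-label pattern. */
static struct jump_label_key_deferred my_key;

static void my_subsys_init(void)
{
	/* coalesce disables: patch the code at most once per timeout */
	jump_label_rate_limit(&my_key, HZ);
}

static void my_feature_get(void)
{
	jump_label_inc(&my_key.key);		/* enables immediately */
}

static void my_feature_put(void)
{
	/* may re-arm the key and retry the disable from delayed work */
	jump_label_dec_deferred(&my_key);
}
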
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b2e08c932d91..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth; | |||
431 | * about it later on, in lockdep_info(). | 431 | * about it later on, in lockdep_info(). |
432 | */ | 432 | */ |
433 | static int lockdep_init_error; | 433 | static int lockdep_init_error; |
434 | static const char *lock_init_error; | ||
434 | static unsigned long lockdep_init_trace_data[20]; | 435 | static unsigned long lockdep_init_trace_data[20]; |
435 | static struct stack_trace lockdep_init_trace = { | 436 | static struct stack_trace lockdep_init_trace = { |
436 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), | 437 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), |
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
499 | usage[i] = '\0'; | 500 | usage[i] = '\0'; |
500 | } | 501 | } |
501 | 502 | ||
502 | static int __print_lock_name(struct lock_class *class) | 503 | static void __print_lock_name(struct lock_class *class) |
503 | { | 504 | { |
504 | char str[KSYM_NAME_LEN]; | 505 | char str[KSYM_NAME_LEN]; |
505 | const char *name; | 506 | const char *name; |
506 | 507 | ||
507 | name = class->name; | 508 | name = class->name; |
508 | if (!name) | ||
509 | name = __get_key_name(class->key, str); | ||
510 | |||
511 | return printk("%s", name); | ||
512 | } | ||
513 | |||
514 | static void print_lock_name(struct lock_class *class) | ||
515 | { | ||
516 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | ||
517 | const char *name; | ||
518 | |||
519 | get_usage_chars(class, usage); | ||
520 | |||
521 | name = class->name; | ||
522 | if (!name) { | 509 | if (!name) { |
523 | name = __get_key_name(class->key, str); | 510 | name = __get_key_name(class->key, str); |
524 | printk(" (%s", name); | 511 | printk("%s", name); |
525 | } else { | 512 | } else { |
526 | printk(" (%s", name); | 513 | printk("%s", name); |
527 | if (class->name_version > 1) | 514 | if (class->name_version > 1) |
528 | printk("#%d", class->name_version); | 515 | printk("#%d", class->name_version); |
529 | if (class->subclass) | 516 | if (class->subclass) |
530 | printk("/%d", class->subclass); | 517 | printk("/%d", class->subclass); |
531 | } | 518 | } |
519 | } | ||
520 | |||
521 | static void print_lock_name(struct lock_class *class) | ||
522 | { | ||
523 | char usage[LOCK_USAGE_CHARS]; | ||
524 | |||
525 | get_usage_chars(class, usage); | ||
526 | |||
527 | printk(" ("); | ||
528 | __print_lock_name(class); | ||
532 | printk("){%s}", usage); | 529 | printk("){%s}", usage); |
533 | } | 530 | } |
534 | 531 | ||
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) | |||
568 | } | 565 | } |
569 | } | 566 | } |
570 | 567 | ||
571 | static void print_kernel_version(void) | 568 | static void print_kernel_ident(void) |
572 | { | 569 | { |
573 | printk("%s %.*s\n", init_utsname()->release, | 570 | printk("%s %.*s %s\n", init_utsname()->release, |
574 | (int)strcspn(init_utsname()->version, " "), | 571 | (int)strcspn(init_utsname()->version, " "), |
575 | init_utsname()->version); | 572 | init_utsname()->version, |
573 | print_tainted()); | ||
576 | } | 574 | } |
577 | 575 | ||
578 | static int very_verbose(struct lock_class *class) | 576 | static int very_verbose(struct lock_class *class) |
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
656 | if (unlikely(!lockdep_initialized)) { | 654 | if (unlikely(!lockdep_initialized)) { |
657 | lockdep_init(); | 655 | lockdep_init(); |
658 | lockdep_init_error = 1; | 656 | lockdep_init_error = 1; |
657 | lock_init_error = lock->name; | ||
659 | save_stack_trace(&lockdep_init_trace); | 658 | save_stack_trace(&lockdep_init_trace); |
660 | } | 659 | } |
661 | #endif | 660 | #endif |
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
723 | 722 | ||
724 | class = look_up_lock_class(lock, subclass); | 723 | class = look_up_lock_class(lock, subclass); |
725 | if (likely(class)) | 724 | if (likely(class)) |
726 | return class; | 725 | goto out_set_class_cache; |
727 | 726 | ||
728 | /* | 727 | /* |
729 | * Debug-check: all keys must be persistent! | 728 | * Debug-check: all keys must be persistent! |
@@ -808,6 +807,7 @@ out_unlock_set: | |||
808 | graph_unlock(); | 807 | graph_unlock(); |
809 | raw_local_irq_restore(flags); | 808 | raw_local_irq_restore(flags); |
810 | 809 | ||
810 | out_set_class_cache: | ||
811 | if (!subclass || force) | 811 | if (!subclass || force) |
812 | lock->class_cache[0] = class; | 812 | lock->class_cache[0] = class; |
813 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | 813 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1149 | printk("\n"); | 1149 | printk("\n"); |
1150 | printk("======================================================\n"); | 1150 | printk("======================================================\n"); |
1151 | printk("[ INFO: possible circular locking dependency detected ]\n"); | 1151 | printk("[ INFO: possible circular locking dependency detected ]\n"); |
1152 | print_kernel_version(); | 1152 | print_kernel_ident(); |
1153 | printk("-------------------------------------------------------\n"); | 1153 | printk("-------------------------------------------------------\n"); |
1154 | printk("%s/%d is trying to acquire lock:\n", | 1154 | printk("%s/%d is trying to acquire lock:\n", |
1155 | curr->comm, task_pid_nr(curr)); | 1155 | curr->comm, task_pid_nr(curr)); |
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1488 | printk("======================================================\n"); | 1488 | printk("======================================================\n"); |
1489 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1489 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", |
1490 | irqclass, irqclass); | 1490 | irqclass, irqclass); |
1491 | print_kernel_version(); | 1491 | print_kernel_ident(); |
1492 | printk("------------------------------------------------------\n"); | 1492 | printk("------------------------------------------------------\n"); |
1493 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1493 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1494 | curr->comm, task_pid_nr(curr), | 1494 | curr->comm, task_pid_nr(curr), |
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1717 | printk("\n"); | 1717 | printk("\n"); |
1718 | printk("=============================================\n"); | 1718 | printk("=============================================\n"); |
1719 | printk("[ INFO: possible recursive locking detected ]\n"); | 1719 | printk("[ INFO: possible recursive locking detected ]\n"); |
1720 | print_kernel_version(); | 1720 | print_kernel_ident(); |
1721 | printk("---------------------------------------------\n"); | 1721 | printk("---------------------------------------------\n"); |
1722 | printk("%s/%d is trying to acquire lock:\n", | 1722 | printk("%s/%d is trying to acquire lock:\n", |
1723 | curr->comm, task_pid_nr(curr)); | 1723 | curr->comm, task_pid_nr(curr)); |
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2224 | printk("\n"); | 2224 | printk("\n"); |
2225 | printk("=================================\n"); | 2225 | printk("=================================\n"); |
2226 | printk("[ INFO: inconsistent lock state ]\n"); | 2226 | printk("[ INFO: inconsistent lock state ]\n"); |
2227 | print_kernel_version(); | 2227 | print_kernel_ident(); |
2228 | printk("---------------------------------\n"); | 2228 | printk("---------------------------------\n"); |
2229 | 2229 | ||
2230 | printk("inconsistent {%s} -> {%s} usage.\n", | 2230 | printk("inconsistent {%s} -> {%s} usage.\n", |
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2289 | printk("\n"); | 2289 | printk("\n"); |
2290 | printk("=========================================================\n"); | 2290 | printk("=========================================================\n"); |
2291 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | 2291 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); |
2292 | print_kernel_version(); | 2292 | print_kernel_ident(); |
2293 | printk("---------------------------------------------------------\n"); | 2293 | printk("---------------------------------------------------------\n"); |
2294 | printk("%s/%d just changed the state of lock:\n", | 2294 | printk("%s/%d just changed the state of lock:\n", |
2295 | curr->comm, task_pid_nr(curr)); | 2295 | curr->comm, task_pid_nr(curr)); |
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3175 | printk("\n"); | 3175 | printk("\n"); |
3176 | printk("=====================================\n"); | 3176 | printk("=====================================\n"); |
3177 | printk("[ BUG: bad unlock balance detected! ]\n"); | 3177 | printk("[ BUG: bad unlock balance detected! ]\n"); |
3178 | print_kernel_ident(); | ||
3178 | printk("-------------------------------------\n"); | 3179 | printk("-------------------------------------\n"); |
3179 | printk("%s/%d is trying to release lock (", | 3180 | printk("%s/%d is trying to release lock (", |
3180 | curr->comm, task_pid_nr(curr)); | 3181 | curr->comm, task_pid_nr(curr)); |
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3619 | printk("\n"); | 3620 | printk("\n"); |
3620 | printk("=================================\n"); | 3621 | printk("=================================\n"); |
3621 | printk("[ BUG: bad contention detected! ]\n"); | 3622 | printk("[ BUG: bad contention detected! ]\n"); |
3623 | print_kernel_ident(); | ||
3622 | printk("---------------------------------\n"); | 3624 | printk("---------------------------------\n"); |
3623 | printk("%s/%d is trying to contend lock (", | 3625 | printk("%s/%d is trying to contend lock (", |
3624 | curr->comm, task_pid_nr(curr)); | 3626 | curr->comm, task_pid_nr(curr)); |
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void) | |||
3974 | 3976 | ||
3975 | #ifdef CONFIG_DEBUG_LOCKDEP | 3977 | #ifdef CONFIG_DEBUG_LOCKDEP |
3976 | if (lockdep_init_error) { | 3978 | if (lockdep_init_error) { |
3977 | printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); | 3979 | printk("WARNING: lockdep init error! lock-%s was acquired" |
3980 | "before lockdep_init\n", lock_init_error); | ||
3978 | printk("Call stack leading to lockdep invocation was:\n"); | 3981 | printk("Call stack leading to lockdep invocation was:\n"); |
3979 | print_stack_trace(&lockdep_init_trace, 0); | 3982 | print_stack_trace(&lockdep_init_trace, 0); |
3980 | } | 3983 | } |
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3993 | printk("\n"); | 3996 | printk("\n"); |
3994 | printk("=========================\n"); | 3997 | printk("=========================\n"); |
3995 | printk("[ BUG: held lock freed! ]\n"); | 3998 | printk("[ BUG: held lock freed! ]\n"); |
3999 | print_kernel_ident(); | ||
3996 | printk("-------------------------\n"); | 4000 | printk("-------------------------\n"); |
3997 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4001 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
3998 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4002 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
4050 | printk("\n"); | 4054 | printk("\n"); |
4051 | printk("=====================================\n"); | 4055 | printk("=====================================\n"); |
4052 | printk("[ BUG: lock held at task exit time! ]\n"); | 4056 | printk("[ BUG: lock held at task exit time! ]\n"); |
4057 | print_kernel_ident(); | ||
4053 | printk("-------------------------------------\n"); | 4058 | printk("-------------------------------------\n"); |
4054 | printk("%s/%d is exiting with locks still held!\n", | 4059 | printk("%s/%d is exiting with locks still held!\n", |
4055 | curr->comm, task_pid_nr(curr)); | 4060 | curr->comm, task_pid_nr(curr)); |
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void) | |||
4147 | printk("\n"); | 4152 | printk("\n"); |
4148 | printk("================================================\n"); | 4153 | printk("================================================\n"); |
4149 | printk("[ BUG: lock held when returning to user space! ]\n"); | 4154 | printk("[ BUG: lock held when returning to user space! ]\n"); |
4155 | print_kernel_ident(); | ||
4150 | printk("------------------------------------------------\n"); | 4156 | printk("------------------------------------------------\n"); |
4151 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4157 | printk("%s/%d is leaving the kernel with locks still held!\n", |
4152 | curr->comm, curr->pid); | 4158 | curr->comm, curr->pid); |
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4166 | printk("\n"); | 4172 | printk("\n"); |
4167 | printk("===============================\n"); | 4173 | printk("===============================\n"); |
4168 | printk("[ INFO: suspicious RCU usage. ]\n"); | 4174 | printk("[ INFO: suspicious RCU usage. ]\n"); |
4175 | print_kernel_ident(); | ||
4169 | printk("-------------------------------\n"); | 4176 | printk("-------------------------------\n"); |
4170 | printk("%s:%d %s!\n", file, line, s); | 4177 | printk("%s:%d %s!\n", file, line, s); |
4171 | printk("\nother info that might help us debug this:\n\n"); | 4178 | printk("\nother info that might help us debug this:\n\n"); |
4172 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | 4179 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); |
4180 | |||
4181 | /* | ||
4182 | * If a CPU is in the RCU-free window in idle (ie: in the section | ||
4183 | * between rcu_idle_enter() and rcu_idle_exit()), then RCU | ||
4184 | * considers that CPU to be in an "extended quiescent state", | ||
4185 | * which means that RCU will be completely ignoring that CPU. | ||
4186 | * Therefore, rcu_read_lock() and friends have absolutely no | ||
4187 | * effect on a CPU running in that state. In other words, even if | ||
4188 | * such an RCU-idle CPU has called rcu_read_lock(), RCU might well | ||
4189 | * delete data structures out from under it. RCU really has no | ||
4190 | * choice here: we need to keep an RCU-free window in idle where | ||
4191 | * the CPU may possibly enter into low power mode. This way we can | ||
4192 | * report an extended quiescent state to other CPUs that started a grace | ||
4193 | * period. Otherwise we would delay any grace period as long as we run | ||
4194 | * in the idle task. | ||
4195 | * | ||
4196 | * So complain bitterly if someone does call rcu_read_lock(), | ||
4197 | * rcu_read_lock_bh() and so on from extended quiescent states. | ||
4198 | */ | ||
4199 | if (rcu_is_cpu_idle()) | ||
4200 | printk("RCU used illegally from extended quiescent state!\n"); | ||
4201 | |||
4173 | lockdep_print_held_locks(curr); | 4202 | lockdep_print_held_locks(curr); |
4174 | printk("\nstack backtrace:\n"); | 4203 | printk("\nstack backtrace:\n"); |
4175 | dump_stack(); | 4204 | dump_stack(); |
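
Note on the lockdep.c hunks above: print_kernel_version() becomes print_kernel_ident(), which additionally appends print_tainted(), and the helper is now called from every lockdep report header (bad unlock balance, bad contention, held lock freed, suspicious RCU usage, and so on). A minimal userspace sketch of what the new helper prints, with the utsname fields and print_tainted() stubbed out as illustrative values:

#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins for init_utsname()->release/version and print_tainted(). */
static const char *release = "3.2.0-rc1";
static const char *version = "#1 SMP Fri Nov 11 12:00:00 UTC 2011";
static const char *print_tainted(void) { return "Tainted: G        W"; }

/* Models print_kernel_ident(): release, short build id, taint flags. */
static void print_kernel_ident(void)
{
        printf("%s %.*s %s\n", release,
               (int)strcspn(version, " "),      /* stop at the first space: keep only "#1" */
               version,
               print_tainted());
}

int main(void)
{
        print_kernel_ident();   /* e.g. "3.2.0-rc1 #1 Tainted: G        W" */
        return 0;
}

The %.*s plus strcspn() keeps only the leading build id from the version string, so each report header carries release, build id and taint flags on one line.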
diff --git a/kernel/panic.c b/kernel/panic.c index b26593604214..3458469eb7c3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -237,11 +237,20 @@ void add_taint(unsigned flag) | |||
237 | * Can't trust the integrity of the kernel anymore. | 237 | * Can't trust the integrity of the kernel anymore. |
238 | * We don't call directly debug_locks_off() because the issue | 238 | * We don't call directly debug_locks_off() because the issue |
239 | * is not necessarily serious enough to set oops_in_progress to 1 | 239 | * is not necessarily serious enough to set oops_in_progress to 1 |
240 | * Also we want to keep up lockdep for staging development and | 240 | * Also we want to keep up lockdep for staging/out-of-tree |
241 | * post-warning case. | 241 | * development and post-warning case. |
242 | */ | 242 | */ |
243 | if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) | 243 | switch (flag) { |
244 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | 244 | case TAINT_CRAP: |
245 | case TAINT_OOT_MODULE: | ||
246 | case TAINT_WARN: | ||
247 | case TAINT_FIRMWARE_WORKAROUND: | ||
248 | break; | ||
249 | |||
250 | default: | ||
251 | if (__debug_locks_off()) | ||
252 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | ||
253 | } | ||
245 | 254 | ||
246 | set_bit(flag, &tainted_mask); | 255 | set_bit(flag, &tainted_mask); |
247 | } | 256 | } |
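
The add_taint() change above replaces the two-flag test with a switch so that the "benign" taints (TAINT_CRAP, TAINT_OOT_MODULE, TAINT_WARN, TAINT_FIRMWARE_WORKAROUND) no longer turn lock debugging off. A compact userspace sketch of that control flow, with the taint constants and __debug_locks_off() stubbed; the numeric values are placeholders, only the switch shape is taken from the hunk:

#include <stdio.h>

/* Hypothetical numeric values; only the filtering logic matters here. */
enum { TAINT_CRAP, TAINT_OOT_MODULE, TAINT_WARN, TAINT_FIRMWARE_WORKAROUND, TAINT_DIE };

static int debug_locks = 1;
static unsigned long tainted_mask;

/* Models __debug_locks_off(): returns the previous value, like xchg(&debug_locks, 0). */
static int __debug_locks_off(void)
{
        int was = debug_locks;
        debug_locks = 0;
        return was;
}

static void add_taint(unsigned flag)
{
        switch (flag) {
        case TAINT_CRAP:
        case TAINT_OOT_MODULE:
        case TAINT_WARN:
        case TAINT_FIRMWARE_WORKAROUND:
                break;                          /* benign taints keep lockdep running */
        default:
                if (__debug_locks_off())
                        printf("Disabling lock debugging due to kernel taint\n");
        }
        tainted_mask |= 1UL << flag;            /* models set_bit(flag, &tainted_mask) */
}

int main(void)
{
        add_taint(TAINT_OOT_MODULE);            /* lock debugging stays enabled */
        add_taint(TAINT_DIE);                   /* now it is switched off, message printed once */
        printf("debug_locks=%d tainted_mask=%#lx\n", debug_locks, tainted_mask);
        return 0;
}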
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e7cb76dc18f5..125cb67daa21 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, | |||
78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
79 | return now.sched < then.sched; | 79 | return now.sched < then.sched; |
80 | } else { | 80 | } else { |
81 | return cputime_lt(now.cpu, then.cpu); | 81 | return now.cpu < then.cpu; |
82 | } | 82 | } |
83 | } | 83 | } |
84 | static inline void cpu_time_add(const clockid_t which_clock, | 84 | static inline void cpu_time_add(const clockid_t which_clock, |
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, | |||
88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
89 | acc->sched += val.sched; | 89 | acc->sched += val.sched; |
90 | } else { | 90 | } else { |
91 | acc->cpu = cputime_add(acc->cpu, val.cpu); | 91 | acc->cpu += val.cpu; |
92 | } | 92 | } |
93 | } | 93 | } |
94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | 94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, |
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | |||
98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
99 | a.sched -= b.sched; | 99 | a.sched -= b.sched; |
100 | } else { | 100 | } else { |
101 | a.cpu = cputime_sub(a.cpu, b.cpu); | 101 | a.cpu -= b.cpu; |
102 | } | 102 | } |
103 | return a; | 103 | return a; |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * Divide and limit the result to res >= 1 | ||
108 | * | ||
109 | * This is necessary to prevent signal delivery starvation, when the result of | ||
110 | * the division would be rounded down to 0. | ||
111 | */ | ||
112 | static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) | ||
113 | { | ||
114 | cputime_t res = cputime_div(time, div); | ||
115 | |||
116 | return max_t(cputime_t, res, 1); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Update expiry time from increment, and increase overrun count, | 107 | * Update expiry time from increment, and increase overrun count, |
121 | * given the current clock sample. | 108 | * given the current clock sample. |
122 | */ | 109 | */ |
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
148 | } else { | 135 | } else { |
149 | cputime_t delta, incr; | 136 | cputime_t delta, incr; |
150 | 137 | ||
151 | if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) | 138 | if (now.cpu < timer->it.cpu.expires.cpu) |
152 | return; | 139 | return; |
153 | incr = timer->it.cpu.incr.cpu; | 140 | incr = timer->it.cpu.incr.cpu; |
154 | delta = cputime_sub(cputime_add(now.cpu, incr), | 141 | delta = now.cpu + incr - timer->it.cpu.expires.cpu; |
155 | timer->it.cpu.expires.cpu); | ||
156 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | 142 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ |
157 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 143 | for (i = 0; incr < delta - incr; i++) |
158 | incr = cputime_add(incr, incr); | 144 | incr += incr; |
159 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 145 | for (; i >= 0; incr = incr >> 1, i--) { |
160 | if (cputime_lt(delta, incr)) | 146 | if (delta < incr) |
161 | continue; | 147 | continue; |
162 | timer->it.cpu.expires.cpu = | 148 | timer->it.cpu.expires.cpu += incr; |
163 | cputime_add(timer->it.cpu.expires.cpu, incr); | ||
164 | timer->it_overrun += 1 << i; | 149 | timer->it_overrun += 1 << i; |
165 | delta = cputime_sub(delta, incr); | 150 | delta -= incr; |
166 | } | 151 | } |
167 | } | 152 | } |
168 | } | 153 | } |
169 | 154 | ||
170 | static inline cputime_t prof_ticks(struct task_struct *p) | 155 | static inline cputime_t prof_ticks(struct task_struct *p) |
171 | { | 156 | { |
172 | return cputime_add(p->utime, p->stime); | 157 | return p->utime + p->stime; |
173 | } | 158 | } |
174 | static inline cputime_t virt_ticks(struct task_struct *p) | 159 | static inline cputime_t virt_ticks(struct task_struct *p) |
175 | { | 160 | { |
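
The bump_cpu_timer() hunk above keeps the binary doubling/halving walk while dropping the cputime_*() accessors: the first loop doubles the increment until it covers the elapsed delta, the second walks back down adding 2^i periods at a time, so the overrun count is computed in O(log n) steps rather than one period per iteration. A standalone model of that overrun accounting, using uint64_t in place of cputime_t (an assumption for illustration):

#include <stdio.h>
#include <stdint.h>

/* Models the cputime branch of bump_cpu_timer(); uint64_t stands in for cputime_t. */
static void bump(uint64_t now, uint64_t *expires, uint64_t incr, long *overrun)
{
        uint64_t delta;
        int i;

        if (now < *expires)
                return;
        delta = now + incr - *expires;
        /* Don't use (incr*2 < delta), incr*2 might overflow. */
        for (i = 0; incr < delta - incr; i++)
                incr += incr;                   /* double until incr covers delta */
        for (; i >= 0; incr >>= 1, i--) {
                if (delta < incr)
                        continue;
                *expires += incr;               /* advance by 2^i whole periods at once */
                *overrun += 1L << i;
                delta -= incr;
        }
}

int main(void)
{
        uint64_t expires = 100, incr = 7;
        long overrun = 0;

        bump(352, &expires, incr, &overrun);
        printf("expires=%llu overrun=%ld\n",    /* 352 is 36 periods past 100: prints 359 and 37 */
               (unsigned long long)expires, overrun);
        return 0;
}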
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
248 | 233 | ||
249 | t = tsk; | 234 | t = tsk; |
250 | do { | 235 | do { |
251 | times->utime = cputime_add(times->utime, t->utime); | 236 | times->utime += t->utime; |
252 | times->stime = cputime_add(times->stime, t->stime); | 237 | times->stime += t->stime; |
253 | times->sum_exec_runtime += task_sched_runtime(t); | 238 | times->sum_exec_runtime += task_sched_runtime(t); |
254 | } while_each_thread(tsk, t); | 239 | } while_each_thread(tsk, t); |
255 | out: | 240 | out: |
@@ -258,10 +243,10 @@ out: | |||
258 | 243 | ||
259 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
260 | { | 245 | { |
261 | if (cputime_gt(b->utime, a->utime)) | 246 | if (b->utime > a->utime) |
262 | a->utime = b->utime; | 247 | a->utime = b->utime; |
263 | 248 | ||
264 | if (cputime_gt(b->stime, a->stime)) | 249 | if (b->stime > a->stime) |
265 | a->stime = b->stime; | 250 | a->stime = b->stime; |
266 | 251 | ||
267 | if (b->sum_exec_runtime > a->sum_exec_runtime) | 252 | if (b->sum_exec_runtime > a->sum_exec_runtime) |
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
306 | return -EINVAL; | 291 | return -EINVAL; |
307 | case CPUCLOCK_PROF: | 292 | case CPUCLOCK_PROF: |
308 | thread_group_cputime(p, &cputime); | 293 | thread_group_cputime(p, &cputime); |
309 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 294 | cpu->cpu = cputime.utime + cputime.stime; |
310 | break; | 295 | break; |
311 | case CPUCLOCK_VIRT: | 296 | case CPUCLOCK_VIRT: |
312 | thread_group_cputime(p, &cputime); | 297 | thread_group_cputime(p, &cputime); |
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head, | |||
470 | unsigned long long sum_exec_runtime) | 455 | unsigned long long sum_exec_runtime) |
471 | { | 456 | { |
472 | struct cpu_timer_list *timer, *next; | 457 | struct cpu_timer_list *timer, *next; |
473 | cputime_t ptime = cputime_add(utime, stime); | 458 | cputime_t ptime = utime + stime; |
474 | 459 | ||
475 | list_for_each_entry_safe(timer, next, head, entry) { | 460 | list_for_each_entry_safe(timer, next, head, entry) { |
476 | list_del_init(&timer->entry); | 461 | list_del_init(&timer->entry); |
477 | if (cputime_lt(timer->expires.cpu, ptime)) { | 462 | if (timer->expires.cpu < ptime) { |
478 | timer->expires.cpu = cputime_zero; | 463 | timer->expires.cpu = 0; |
479 | } else { | 464 | } else { |
480 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 465 | timer->expires.cpu -= ptime; |
481 | ptime); | ||
482 | } | 466 | } |
483 | } | 467 | } |
484 | 468 | ||
485 | ++head; | 469 | ++head; |
486 | list_for_each_entry_safe(timer, next, head, entry) { | 470 | list_for_each_entry_safe(timer, next, head, entry) { |
487 | list_del_init(&timer->entry); | 471 | list_del_init(&timer->entry); |
488 | if (cputime_lt(timer->expires.cpu, utime)) { | 472 | if (timer->expires.cpu < utime) { |
489 | timer->expires.cpu = cputime_zero; | 473 | timer->expires.cpu = 0; |
490 | } else { | 474 | } else { |
491 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 475 | timer->expires.cpu -= utime; |
492 | utime); | ||
493 | } | 476 | } |
494 | } | 477 | } |
495 | 478 | ||
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
520 | struct signal_struct *const sig = tsk->signal; | 503 | struct signal_struct *const sig = tsk->signal; |
521 | 504 | ||
522 | cleanup_timers(tsk->signal->cpu_timers, | 505 | cleanup_timers(tsk->signal->cpu_timers, |
523 | cputime_add(tsk->utime, sig->utime), | 506 | tsk->utime + sig->utime, tsk->stime + sig->stime, |
524 | cputime_add(tsk->stime, sig->stime), | ||
525 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | 507 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); |
526 | } | 508 | } |
527 | 509 | ||
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | |||
540 | 522 | ||
541 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 523 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) |
542 | { | 524 | { |
543 | return cputime_eq(expires, cputime_zero) || | 525 | return expires == 0 || expires > new_exp; |
544 | cputime_gt(expires, new_exp); | ||
545 | } | 526 | } |
546 | 527 | ||
547 | /* | 528 | /* |
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
651 | default: | 632 | default: |
652 | return -EINVAL; | 633 | return -EINVAL; |
653 | case CPUCLOCK_PROF: | 634 | case CPUCLOCK_PROF: |
654 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 635 | cpu->cpu = cputime.utime + cputime.stime; |
655 | break; | 636 | break; |
656 | case CPUCLOCK_VIRT: | 637 | case CPUCLOCK_VIRT: |
657 | cpu->cpu = cputime.utime; | 638 | cpu->cpu = cputime.utime; |
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
918 | unsigned long soft; | 899 | unsigned long soft; |
919 | 900 | ||
920 | maxfire = 20; | 901 | maxfire = 20; |
921 | tsk->cputime_expires.prof_exp = cputime_zero; | 902 | tsk->cputime_expires.prof_exp = 0; |
922 | while (!list_empty(timers)) { | 903 | while (!list_empty(timers)) { |
923 | struct cpu_timer_list *t = list_first_entry(timers, | 904 | struct cpu_timer_list *t = list_first_entry(timers, |
924 | struct cpu_timer_list, | 905 | struct cpu_timer_list, |
925 | entry); | 906 | entry); |
926 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 907 | if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { |
927 | tsk->cputime_expires.prof_exp = t->expires.cpu; | 908 | tsk->cputime_expires.prof_exp = t->expires.cpu; |
928 | break; | 909 | break; |
929 | } | 910 | } |
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
933 | 914 | ||
934 | ++timers; | 915 | ++timers; |
935 | maxfire = 20; | 916 | maxfire = 20; |
936 | tsk->cputime_expires.virt_exp = cputime_zero; | 917 | tsk->cputime_expires.virt_exp = 0; |
937 | while (!list_empty(timers)) { | 918 | while (!list_empty(timers)) { |
938 | struct cpu_timer_list *t = list_first_entry(timers, | 919 | struct cpu_timer_list *t = list_first_entry(timers, |
939 | struct cpu_timer_list, | 920 | struct cpu_timer_list, |
940 | entry); | 921 | entry); |
941 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 922 | if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { |
942 | tsk->cputime_expires.virt_exp = t->expires.cpu; | 923 | tsk->cputime_expires.virt_exp = t->expires.cpu; |
943 | break; | 924 | break; |
944 | } | 925 | } |
@@ -1009,20 +990,19 @@ static u32 onecputick; | |||
1009 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | 990 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, |
1010 | cputime_t *expires, cputime_t cur_time, int signo) | 991 | cputime_t *expires, cputime_t cur_time, int signo) |
1011 | { | 992 | { |
1012 | if (cputime_eq(it->expires, cputime_zero)) | 993 | if (!it->expires) |
1013 | return; | 994 | return; |
1014 | 995 | ||
1015 | if (cputime_ge(cur_time, it->expires)) { | 996 | if (cur_time >= it->expires) { |
1016 | if (!cputime_eq(it->incr, cputime_zero)) { | 997 | if (it->incr) { |
1017 | it->expires = cputime_add(it->expires, it->incr); | 998 | it->expires += it->incr; |
1018 | it->error += it->incr_error; | 999 | it->error += it->incr_error; |
1019 | if (it->error >= onecputick) { | 1000 | if (it->error >= onecputick) { |
1020 | it->expires = cputime_sub(it->expires, | 1001 | it->expires -= cputime_one_jiffy; |
1021 | cputime_one_jiffy); | ||
1022 | it->error -= onecputick; | 1002 | it->error -= onecputick; |
1023 | } | 1003 | } |
1024 | } else { | 1004 | } else { |
1025 | it->expires = cputime_zero; | 1005 | it->expires = 0; |
1026 | } | 1006 | } |
1027 | 1007 | ||
1028 | trace_itimer_expire(signo == SIGPROF ? | 1008 | trace_itimer_expire(signo == SIGPROF ? |
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1031 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); | 1011 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); |
1032 | } | 1012 | } |
1033 | 1013 | ||
1034 | if (!cputime_eq(it->expires, cputime_zero) && | 1014 | if (it->expires && (!*expires || it->expires < *expires)) { |
1035 | (cputime_eq(*expires, cputime_zero) || | ||
1036 | cputime_lt(it->expires, *expires))) { | ||
1037 | *expires = it->expires; | 1015 | *expires = it->expires; |
1038 | } | 1016 | } |
1039 | } | 1017 | } |
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1048 | */ | 1026 | */ |
1049 | static inline int task_cputime_zero(const struct task_cputime *cputime) | 1027 | static inline int task_cputime_zero(const struct task_cputime *cputime) |
1050 | { | 1028 | { |
1051 | if (cputime_eq(cputime->utime, cputime_zero) && | 1029 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) |
1052 | cputime_eq(cputime->stime, cputime_zero) && | ||
1053 | cputime->sum_exec_runtime == 0) | ||
1054 | return 1; | 1030 | return 1; |
1055 | return 0; | 1031 | return 0; |
1056 | } | 1032 | } |
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk, | |||
1076 | */ | 1052 | */ |
1077 | thread_group_cputimer(tsk, &cputime); | 1053 | thread_group_cputimer(tsk, &cputime); |
1078 | utime = cputime.utime; | 1054 | utime = cputime.utime; |
1079 | ptime = cputime_add(utime, cputime.stime); | 1055 | ptime = utime + cputime.stime; |
1080 | sum_sched_runtime = cputime.sum_exec_runtime; | 1056 | sum_sched_runtime = cputime.sum_exec_runtime; |
1081 | maxfire = 20; | 1057 | maxfire = 20; |
1082 | prof_expires = cputime_zero; | 1058 | prof_expires = 0; |
1083 | while (!list_empty(timers)) { | 1059 | while (!list_empty(timers)) { |
1084 | struct cpu_timer_list *tl = list_first_entry(timers, | 1060 | struct cpu_timer_list *tl = list_first_entry(timers, |
1085 | struct cpu_timer_list, | 1061 | struct cpu_timer_list, |
1086 | entry); | 1062 | entry); |
1087 | if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { | 1063 | if (!--maxfire || ptime < tl->expires.cpu) { |
1088 | prof_expires = tl->expires.cpu; | 1064 | prof_expires = tl->expires.cpu; |
1089 | break; | 1065 | break; |
1090 | } | 1066 | } |
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk, | |||
1094 | 1070 | ||
1095 | ++timers; | 1071 | ++timers; |
1096 | maxfire = 20; | 1072 | maxfire = 20; |
1097 | virt_expires = cputime_zero; | 1073 | virt_expires = 0; |
1098 | while (!list_empty(timers)) { | 1074 | while (!list_empty(timers)) { |
1099 | struct cpu_timer_list *tl = list_first_entry(timers, | 1075 | struct cpu_timer_list *tl = list_first_entry(timers, |
1100 | struct cpu_timer_list, | 1076 | struct cpu_timer_list, |
1101 | entry); | 1077 | entry); |
1102 | if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { | 1078 | if (!--maxfire || utime < tl->expires.cpu) { |
1103 | virt_expires = tl->expires.cpu; | 1079 | virt_expires = tl->expires.cpu; |
1104 | break; | 1080 | break; |
1105 | } | 1081 | } |
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1154 | } | 1130 | } |
1155 | } | 1131 | } |
1156 | x = secs_to_cputime(soft); | 1132 | x = secs_to_cputime(soft); |
1157 | if (cputime_eq(prof_expires, cputime_zero) || | 1133 | if (!prof_expires || x < prof_expires) { |
1158 | cputime_lt(x, prof_expires)) { | ||
1159 | prof_expires = x; | 1134 | prof_expires = x; |
1160 | } | 1135 | } |
1161 | } | 1136 | } |
@@ -1249,12 +1224,9 @@ out: | |||
1249 | static inline int task_cputime_expired(const struct task_cputime *sample, | 1224 | static inline int task_cputime_expired(const struct task_cputime *sample, |
1250 | const struct task_cputime *expires) | 1225 | const struct task_cputime *expires) |
1251 | { | 1226 | { |
1252 | if (!cputime_eq(expires->utime, cputime_zero) && | 1227 | if (expires->utime && sample->utime >= expires->utime) |
1253 | cputime_ge(sample->utime, expires->utime)) | ||
1254 | return 1; | 1228 | return 1; |
1255 | if (!cputime_eq(expires->stime, cputime_zero) && | 1229 | if (expires->stime && sample->utime + sample->stime >= expires->stime) |
1256 | cputime_ge(cputime_add(sample->utime, sample->stime), | ||
1257 | expires->stime)) | ||
1258 | return 1; | 1230 | return 1; |
1259 | if (expires->sum_exec_runtime != 0 && | 1231 | if (expires->sum_exec_runtime != 0 && |
1260 | sample->sum_exec_runtime >= expires->sum_exec_runtime) | 1232 | sample->sum_exec_runtime >= expires->sum_exec_runtime) |
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1389 | * it to be relative, *newval argument is relative and we update | 1361 | * it to be relative, *newval argument is relative and we update |
1390 | * it to be absolute. | 1362 | * it to be absolute. |
1391 | */ | 1363 | */ |
1392 | if (!cputime_eq(*oldval, cputime_zero)) { | 1364 | if (*oldval) { |
1393 | if (cputime_le(*oldval, now.cpu)) { | 1365 | if (*oldval <= now.cpu) { |
1394 | /* Just about to fire. */ | 1366 | /* Just about to fire. */ |
1395 | *oldval = cputime_one_jiffy; | 1367 | *oldval = cputime_one_jiffy; |
1396 | } else { | 1368 | } else { |
1397 | *oldval = cputime_sub(*oldval, now.cpu); | 1369 | *oldval -= now.cpu; |
1398 | } | 1370 | } |
1399 | } | 1371 | } |
1400 | 1372 | ||
1401 | if (cputime_eq(*newval, cputime_zero)) | 1373 | if (!*newval) |
1402 | return; | 1374 | return; |
1403 | *newval = cputime_add(*newval, now.cpu); | 1375 | *newval += now.cpu; |
1404 | } | 1376 | } |
1405 | 1377 | ||
1406 | /* | 1378 | /* |
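
Most of the posix-cpu-timers.c churn above is mechanical: cputime_t is now an ordinary scalar, so cputime_add()/cputime_sub()/cputime_lt()/cputime_eq()/cputime_zero collapse into plain C operators and literals. A small before/after sketch of the expiry-test pattern used in task_cputime_expired(); the macro definitions shown are illustrative stand-ins, not the kernel's exact ones:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cputime_t;             /* assumption: cputime_t behaves as a plain integer */

/* Illustrative stand-ins for the removed accessor macros. */
#define cputime_add(a, b)       ((a) + (b))
#define cputime_eq(a, b)        ((a) == (b))
#define cputime_ge(a, b)        ((a) >= (b))
#define cputime_zero            ((cputime_t)0)

int main(void)
{
        cputime_t utime = 40, stime = 2, expires = 50;

        /* Before: the task_cputime_expired()-style test written with the accessors. */
        cputime_t ptime_old = cputime_add(utime, stime);
        int expired_old = !cputime_eq(expires, cputime_zero) &&
                          cputime_ge(ptime_old, expires);

        /* After: the same test with plain operators, as in the new code. */
        cputime_t ptime = utime + stime;
        int expired = expires && ptime >= expires;

        printf("ptime=%llu expired: old=%d new=%d\n",
               (unsigned long long)ptime, expired_old, expired);
        return 0;
}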
diff --git a/kernel/printk.c b/kernel/printk.c index 7982a0a841ea..989e4a52da76 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early) | |||
199 | unsigned long mem; | 199 | unsigned long mem; |
200 | 200 | ||
201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); | 201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
202 | if (mem == MEMBLOCK_ERROR) | 202 | if (!mem) |
203 | return; | 203 | return; |
204 | new_log_buf = __va(mem); | 204 | new_log_buf = __va(mem); |
205 | } else { | 205 | } else { |
@@ -688,6 +688,7 @@ static void zap_locks(void) | |||
688 | 688 | ||
689 | oops_timestamp = jiffies; | 689 | oops_timestamp = jiffies; |
690 | 690 | ||
691 | debug_locks_off(); | ||
691 | /* If a crash is occurring, make sure we can't deadlock */ | 692 | /* If a crash is occurring, make sure we can't deadlock */ |
692 | raw_spin_lock_init(&logbuf_lock); | 693 | raw_spin_lock_init(&logbuf_lock); |
693 | /* And make sure that we print immediately */ | 694 | /* And make sure that we print immediately */ |
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
840 | boot_delay_msec(); | 841 | boot_delay_msec(); |
841 | printk_delay(); | 842 | printk_delay(); |
842 | 843 | ||
843 | preempt_disable(); | ||
844 | /* This stops the holder of console_sem just where we want him */ | 844 | /* This stops the holder of console_sem just where we want him */ |
845 | raw_local_irq_save(flags); | 845 | local_irq_save(flags); |
846 | this_cpu = smp_processor_id(); | 846 | this_cpu = smp_processor_id(); |
847 | 847 | ||
848 | /* | 848 | /* |
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | * recursion and return - but flag the recursion so that | 856 | * recursion and return - but flag the recursion so that |
857 | * it can be printed at the next appropriate moment: | 857 | * it can be printed at the next appropriate moment: |
858 | */ | 858 | */ |
859 | if (!oops_in_progress) { | 859 | if (!oops_in_progress && !lockdep_recursing(current)) { |
860 | recursion_bug = 1; | 860 | recursion_bug = 1; |
861 | goto out_restore_irqs; | 861 | goto out_restore_irqs; |
862 | } | 862 | } |
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
962 | 962 | ||
963 | lockdep_on(); | 963 | lockdep_on(); |
964 | out_restore_irqs: | 964 | out_restore_irqs: |
965 | raw_local_irq_restore(flags); | 965 | local_irq_restore(flags); |
966 | 966 | ||
967 | preempt_enable(); | ||
968 | return printed_len; | 967 | return printed_len; |
969 | } | 968 | } |
970 | EXPORT_SYMBOL(printk); | 969 | EXPORT_SYMBOL(printk); |
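
In printk.c above, zap_locks() now turns lockdep off before re-initialising logbuf_lock, the explicit preempt_disable()/preempt_enable() pair is dropped (interrupts are disabled across the same region, which already prevents preemption), and the recursion check tolerates nested printk() calls that originate from lockdep itself. A simplified model of the shape of that recursion check, with the kernel state replaced by plain flags:

#include <stdio.h>

/* Plain flags standing in for the kernel state consulted by vprintk(). */
static int oops_in_progress;
static int lockdep_recursing_current;           /* models lockdep_recursing(current) */
static int recursion_bug;
static int printk_cpu = -1;                     /* CPU currently inside printk() */

/* Returns 1 when a nested printk() must bail out instead of retaking logbuf_lock. */
static int printk_recursion_check(int this_cpu)
{
        if (printk_cpu != this_cpu)
                return 0;                       /* no recursion: take the lock normally */
        /*
         * Recursing on the same CPU is tolerated only during an oops or when
         * lockdep itself triggered the nested printk(); otherwise flag it so
         * the message can be reported later.  (The real code also calls
         * zap_locks() before proceeding in the tolerated case.)
         */
        if (!oops_in_progress && !lockdep_recursing_current) {
                recursion_bug = 1;
                return 1;
        }
        return 0;
}

int main(void)
{
        printk_cpu = 0;
        printf("nested printk suppressed: %d\n", printk_recursion_check(0)); /* 1 */
        lockdep_recursing_current = 1;
        printf("nested printk suppressed: %d\n", printk_recursion_check(0)); /* 0 */
        return 0;
}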
diff --git a/kernel/rcu.h b/kernel/rcu.h index f600868d550d..aa88baab5f78 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -30,6 +30,13 @@ | |||
30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Process-level increment to ->dynticks_nesting field. This allows for | ||
34 | * architectures that use half-interrupts and half-exceptions from | ||
35 | * process context. | ||
36 | */ | ||
37 | #define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) | ||
38 | |||
39 | /* | ||
33 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 40 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
34 | * by call_rcu() and rcu callback execution, and are therefore not part of the | 41 | * by call_rcu() and rcu callback execution, and are therefore not part of the |
35 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. | 42 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c5b98e565aee..2bc4e135ff23 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void) | |||
93 | { | 93 | { |
94 | if (!debug_lockdep_rcu_enabled()) | 94 | if (!debug_lockdep_rcu_enabled()) |
95 | return 1; | 95 | return 1; |
96 | if (rcu_is_cpu_idle()) | ||
97 | return 0; | ||
96 | return in_softirq() || irqs_disabled(); | 98 | return in_softirq() || irqs_disabled(); |
97 | } | 99 | } |
98 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 100 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = { | |||
316 | }; | 318 | }; |
317 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 319 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
318 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 320 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
321 | |||
322 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | ||
323 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) | ||
324 | { | ||
325 | trace_rcu_torture_read(rcutorturename, rhp); | ||
326 | } | ||
327 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | ||
328 | #else | ||
329 | #define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) | ||
330 | #endif | ||
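
Two things change in rcupdate.c: rcu_read_lock_bh_held() now answers "not held" whenever the CPU is RCU-idle, and do_trace_rcu_torture_read() is exported so the rcutorture module can fire the trace_rcu_torture_read tracepoint. A userspace model of how the new idle check changes the lockdep answer (the flags stand in for the kernel predicates):

#include <stdio.h>

/* Flags standing in for the predicates consulted by rcu_read_lock_bh_held(). */
static int debug_lockdep_rcu_enabled_f = 1;
static int rcu_is_cpu_idle_f;                   /* models rcu_is_cpu_idle() */
static int in_softirq_f, irqs_disabled_f;

/* Shape of rcu_read_lock_bh_held() after the hunk above. */
static int rcu_read_lock_bh_held(void)
{
        if (!debug_lockdep_rcu_enabled_f)
                return 1;                       /* lockdep disabled: assume held */
        if (rcu_is_cpu_idle_f)
                return 0;                       /* new: RCU ignores idle CPUs entirely */
        return in_softirq_f || irqs_disabled_f;
}

int main(void)
{
        in_softirq_f = 1;
        printf("softirq, not idle: held=%d\n", rcu_read_lock_bh_held());    /* 1 */
        rcu_is_cpu_idle_f = 1;
        printf("softirq, but idle: held=%d\n", rcu_read_lock_bh_held());    /* 0 */
        return 0;
}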
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 636af6d9c6e5..977296dca0a4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head, | |||
53 | 53 | ||
54 | #include "rcutiny_plugin.h" | 54 | #include "rcutiny_plugin.h" |
55 | 55 | ||
56 | #ifdef CONFIG_NO_HZ | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; |
57 | 57 | ||
58 | static long rcu_dynticks_nesting = 1; | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
59 | static void rcu_idle_enter_common(long long oldval) | ||
60 | { | ||
61 | if (rcu_dynticks_nesting) { | ||
62 | RCU_TRACE(trace_rcu_dyntick("--=", | ||
63 | oldval, rcu_dynticks_nesting)); | ||
64 | return; | ||
65 | } | ||
66 | RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); | ||
67 | if (!is_idle_task(current)) { | ||
68 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
69 | |||
70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | ||
71 | oldval, rcu_dynticks_nesting)); | ||
72 | ftrace_dump(DUMP_ALL); | ||
73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
74 | current->pid, current->comm, | ||
75 | idle->pid, idle->comm); /* must be idle task! */ | ||
76 | } | ||
77 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | ||
78 | } | ||
59 | 79 | ||
60 | /* | 80 | /* |
61 | * Enter dynticks-idle mode, which is an extended quiescent state | 81 | * Enter idle, which is an extended quiescent state if we have fully |
62 | * if we have fully entered that mode (i.e., if the new value of | 82 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). |
63 | * dynticks_nesting is zero). | ||
64 | */ | 83 | */ |
65 | void rcu_enter_nohz(void) | 84 | void rcu_idle_enter(void) |
66 | { | 85 | { |
67 | if (--rcu_dynticks_nesting == 0) | 86 | unsigned long flags; |
68 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 87 | long long oldval; |
88 | |||
89 | local_irq_save(flags); | ||
90 | oldval = rcu_dynticks_nesting; | ||
91 | rcu_dynticks_nesting = 0; | ||
92 | rcu_idle_enter_common(oldval); | ||
93 | local_irq_restore(flags); | ||
69 | } | 94 | } |
70 | 95 | ||
71 | /* | 96 | /* |
72 | * Exit dynticks-idle mode, so that we are no longer in an extended | 97 | * Exit an interrupt handler towards idle. |
73 | * quiescent state. | ||
74 | */ | 98 | */ |
75 | void rcu_exit_nohz(void) | 99 | void rcu_irq_exit(void) |
100 | { | ||
101 | unsigned long flags; | ||
102 | long long oldval; | ||
103 | |||
104 | local_irq_save(flags); | ||
105 | oldval = rcu_dynticks_nesting; | ||
106 | rcu_dynticks_nesting--; | ||
107 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
108 | rcu_idle_enter_common(oldval); | ||
109 | local_irq_restore(flags); | ||
110 | } | ||
111 | |||
112 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | ||
113 | static void rcu_idle_exit_common(long long oldval) | ||
76 | { | 114 | { |
115 | if (oldval) { | ||
116 | RCU_TRACE(trace_rcu_dyntick("++=", | ||
117 | oldval, rcu_dynticks_nesting)); | ||
118 | return; | ||
119 | } | ||
120 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | ||
121 | if (!is_idle_task(current)) { | ||
122 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
123 | |||
124 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | ||
125 | oldval, rcu_dynticks_nesting)); | ||
126 | ftrace_dump(DUMP_ALL); | ||
127 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
128 | current->pid, current->comm, | ||
129 | idle->pid, idle->comm); /* must be idle task! */ | ||
130 | } | ||
131 | } | ||
132 | |||
133 | /* | ||
134 | * Exit idle, so that we are no longer in an extended quiescent state. | ||
135 | */ | ||
136 | void rcu_idle_exit(void) | ||
137 | { | ||
138 | unsigned long flags; | ||
139 | long long oldval; | ||
140 | |||
141 | local_irq_save(flags); | ||
142 | oldval = rcu_dynticks_nesting; | ||
143 | WARN_ON_ONCE(oldval != 0); | ||
144 | rcu_dynticks_nesting = DYNTICK_TASK_NESTING; | ||
145 | rcu_idle_exit_common(oldval); | ||
146 | local_irq_restore(flags); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Enter an interrupt handler, moving away from idle. | ||
151 | */ | ||
152 | void rcu_irq_enter(void) | ||
153 | { | ||
154 | unsigned long flags; | ||
155 | long long oldval; | ||
156 | |||
157 | local_irq_save(flags); | ||
158 | oldval = rcu_dynticks_nesting; | ||
77 | rcu_dynticks_nesting++; | 159 | rcu_dynticks_nesting++; |
160 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
161 | rcu_idle_exit_common(oldval); | ||
162 | local_irq_restore(flags); | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_PROVE_RCU | ||
166 | |||
167 | /* | ||
168 | * Test whether RCU thinks that the current CPU is idle. | ||
169 | */ | ||
170 | int rcu_is_cpu_idle(void) | ||
171 | { | ||
172 | return !rcu_dynticks_nesting; | ||
78 | } | 173 | } |
174 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
175 | |||
176 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
79 | 177 | ||
80 | #endif /* #ifdef CONFIG_NO_HZ */ | 178 | /* |
179 | * Test whether the current CPU was interrupted from idle. Nested | ||
180 | * interrupts don't count, we must be running at the first interrupt | ||
181 | * level. | ||
182 | */ | ||
183 | int rcu_is_cpu_rrupt_from_idle(void) | ||
184 | { | ||
185 | return rcu_dynticks_nesting <= 0; | ||
186 | } | ||
81 | 187 | ||
82 | /* | 188 | /* |
83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 189 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu) | |||
126 | 232 | ||
127 | /* | 233 | /* |
128 | * Check to see if the scheduling-clock interrupt came from an extended | 234 | * Check to see if the scheduling-clock interrupt came from an extended |
129 | * quiescent state, and, if so, tell RCU about it. | 235 | * quiescent state, and, if so, tell RCU about it. This function must |
236 | * be called from hardirq context. It is normally called from the | ||
237 | * scheduling-clock interrupt. | ||
130 | */ | 238 | */ |
131 | void rcu_check_callbacks(int cpu, int user) | 239 | void rcu_check_callbacks(int cpu, int user) |
132 | { | 240 | { |
133 | if (user || | 241 | if (user || rcu_is_cpu_rrupt_from_idle()) |
134 | (idle_cpu(cpu) && | ||
135 | !in_softirq() && | ||
136 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) | ||
137 | rcu_sched_qs(cpu); | 242 | rcu_sched_qs(cpu); |
138 | else if (!in_softirq()) | 243 | else if (!in_softirq()) |
139 | rcu_bh_qs(cpu); | 244 | rcu_bh_qs(cpu); |
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
154 | /* If no RCU callbacks ready to invoke, just return. */ | 259 | /* If no RCU callbacks ready to invoke, just return. */ |
155 | if (&rcp->rcucblist == rcp->donetail) { | 260 | if (&rcp->rcucblist == rcp->donetail) { |
156 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | 261 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); |
157 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); | 262 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
263 | ACCESS_ONCE(rcp->rcucblist), | ||
264 | need_resched(), | ||
265 | is_idle_task(current), | ||
266 | rcu_is_callbacks_kthread())); | ||
158 | return; | 267 | return; |
159 | } | 268 | } |
160 | 269 | ||
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
183 | RCU_TRACE(cb_count++); | 292 | RCU_TRACE(cb_count++); |
184 | } | 293 | } |
185 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 294 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
186 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); | 295 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), |
296 | is_idle_task(current), | ||
297 | rcu_is_callbacks_kthread())); | ||
187 | } | 298 | } |
188 | 299 | ||
189 | static void rcu_process_callbacks(struct softirq_action *unused) | 300 | static void rcu_process_callbacks(struct softirq_action *unused) |
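
For TINY_RCU the old CONFIG_NO_HZ-only rcu_enter_nohz()/rcu_exit_nohz() pair becomes four unconditional entry points (rcu_idle_enter()/rcu_idle_exit() for the idle loop, rcu_irq_enter()/rcu_irq_exit() for interrupts), all driving the single rcu_dynticks_nesting counter that starts at the DYNTICK_TASK_NESTING bias from the rcu.h hunk above. A compressed model of that state machine (tracing, WARN_ON_ONCE() checks and quiescent-state reporting elided):

#include <stdio.h>
#include <limits.h>

#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)

static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;

static int rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; }

static void rcu_idle_enter(void) { rcu_dynticks_nesting = 0; }  /* real code reports a QS here */
static void rcu_idle_exit(void)  { rcu_dynticks_nesting = DYNTICK_TASK_NESTING; }
static void rcu_irq_enter(void)  { rcu_dynticks_nesting++; }
static void rcu_irq_exit(void)   { rcu_dynticks_nesting--; }    /* hitting 0 means idle again */

int main(void)
{
        printf("task context:      idle=%d\n", rcu_is_cpu_idle());     /* 0 */
        rcu_idle_enter();
        printf("idle loop:         idle=%d\n", rcu_is_cpu_idle());     /* 1 */
        rcu_irq_enter();
        printf("irq from idle:     idle=%d\n", rcu_is_cpu_idle());     /* 0 */
        rcu_irq_exit();
        printf("back in idle loop: idle=%d\n", rcu_is_cpu_idle());     /* 1 */
        rcu_idle_exit();
        printf("back in a task:    idle=%d\n", rcu_is_cpu_idle());     /* 0 */
        return 0;
}

The large task-level bias is what lets rcu_irq_exit() decrement without ever reaching zero while a task is running, so only the idle loop can put the CPU into the RCU-idle state.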
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 2b0484a5dc28..9cb1ae4aabdd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -312,8 +312,8 @@ static int rcu_boost(void) | |||
312 | rt_mutex_lock(&mtx); | 312 | rt_mutex_lock(&mtx); |
313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
314 | 314 | ||
315 | return rcu_preempt_ctrlblk.boost_tasks != NULL || | 315 | return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || |
316 | rcu_preempt_ctrlblk.exp_tasks != NULL; | 316 | ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; |
317 | } | 317 | } |
318 | 318 | ||
319 | /* | 319 | /* |
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void) | |||
885 | wake_up(&rcu_kthread_wq); | 885 | wake_up(&rcu_kthread_wq); |
886 | } | 886 | } |
887 | 887 | ||
888 | #ifdef CONFIG_RCU_TRACE | ||
889 | |||
890 | /* | ||
891 | * Is the current CPU running the RCU-callbacks kthread? | ||
892 | * Caller must have preemption disabled. | ||
893 | */ | ||
894 | static bool rcu_is_callbacks_kthread(void) | ||
895 | { | ||
896 | return rcu_kthread_task == current; | ||
897 | } | ||
898 | |||
899 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
900 | |||
888 | /* | 901 | /* |
889 | * This kthread invokes RCU callbacks whose grace periods have | 902 | * This kthread invokes RCU callbacks whose grace periods have |
890 | * elapsed. It is awakened as needed, and takes the place of the | 903 | * elapsed. It is awakened as needed, and takes the place of the |
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void) | |||
938 | raise_softirq(RCU_SOFTIRQ); | 951 | raise_softirq(RCU_SOFTIRQ); |
939 | } | 952 | } |
940 | 953 | ||
954 | #ifdef CONFIG_RCU_TRACE | ||
955 | |||
956 | /* | ||
957 | * There is no callback kthread, so this thread is never it. | ||
958 | */ | ||
959 | static bool rcu_is_callbacks_kthread(void) | ||
960 | { | ||
961 | return false; | ||
962 | } | ||
963 | |||
964 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
965 | |||
941 | void rcu_init(void) | 966 | void rcu_init(void) |
942 | { | 967 | { |
943 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 968 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
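
The rcutiny_plugin.h hunks add ACCESS_ONCE() around the boost_tasks/exp_tasks re-reads in rcu_boost() and provide the two rcu_is_callbacks_kthread() variants used by the extended trace_rcu_batch_end() call. The sketch below shows what the ACCESS_ONCE() annotation buys: each read of the shared pointer is forced to be a single volatile load rather than something the compiler may re-fetch or fold away. The macro body matches the kernel's definition; the surrounding types are illustrative, and GNU C (__typeof__) is assumed:

#include <stdio.h>

/* Force a single volatile load/store of x, as the kernel's ACCESS_ONCE() does. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct rcu_task { int unused; };
static struct rcu_task *boost_tasks, *exp_tasks;        /* shared with other contexts */

/* Models the return statement of rcu_boost() after the hunk above. */
static int rcu_boost_pending(void)
{
        return ACCESS_ONCE(boost_tasks) != NULL ||
               ACCESS_ONCE(exp_tasks) != NULL;
}

int main(void)
{
        static struct rcu_task t;

        printf("pending=%d\n", rcu_boost_pending());    /* 0 */
        exp_tasks = &t;                                 /* as if another context queued work */
        printf("pending=%d\n", rcu_boost_pending());    /* 1 */
        return 0;
}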
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 764825c2685c..88f17b8a3b1d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | |||
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
68 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | 69 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ |
68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | 70 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ |
69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | 71 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ |
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444); | |||
91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 93 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
92 | module_param(fqs_stutter, int, 0444); | 94 | module_param(fqs_stutter, int, 0444); |
93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 95 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
96 | module_param(onoff_interval, int, 0444); | ||
97 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
98 | module_param(shutdown_secs, int, 0444); | ||
99 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | ||
94 | module_param(test_boost, int, 0444); | 100 | module_param(test_boost, int, 0444); |
95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 101 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
96 | module_param(test_boost_interval, int, 0444); | 102 | module_param(test_boost_interval, int, 0444); |
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task; | |||
119 | static struct task_struct *stutter_task; | 125 | static struct task_struct *stutter_task; |
120 | static struct task_struct *fqs_task; | 126 | static struct task_struct *fqs_task; |
121 | static struct task_struct *boost_tasks[NR_CPUS]; | 127 | static struct task_struct *boost_tasks[NR_CPUS]; |
128 | static struct task_struct *shutdown_task; | ||
129 | #ifdef CONFIG_HOTPLUG_CPU | ||
130 | static struct task_struct *onoff_task; | ||
131 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
122 | 132 | ||
123 | #define RCU_TORTURE_PIPE_LEN 10 | 133 | #define RCU_TORTURE_PIPE_LEN 10 |
124 | 134 | ||
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror; | |||
149 | static long n_rcu_torture_boost_failure; | 159 | static long n_rcu_torture_boost_failure; |
150 | static long n_rcu_torture_boosts; | 160 | static long n_rcu_torture_boosts; |
151 | static long n_rcu_torture_timers; | 161 | static long n_rcu_torture_timers; |
162 | static long n_offline_attempts; | ||
163 | static long n_offline_successes; | ||
164 | static long n_online_attempts; | ||
165 | static long n_online_successes; | ||
152 | static struct list_head rcu_torture_removed; | 166 | static struct list_head rcu_torture_removed; |
153 | static cpumask_var_t shuffle_tmp_mask; | 167 | static cpumask_var_t shuffle_tmp_mask; |
154 | 168 | ||
@@ -160,6 +174,8 @@ static int stutter_pause_test; | |||
160 | #define RCUTORTURE_RUNNABLE_INIT 0 | 174 | #define RCUTORTURE_RUNNABLE_INIT 0 |
161 | #endif | 175 | #endif |
162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 176 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
177 | module_param(rcutorture_runnable, int, 0444); | ||
178 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | ||
163 | 179 | ||
164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 180 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
165 | #define rcu_can_boost() 1 | 181 | #define rcu_can_boost() 1 |
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
167 | #define rcu_can_boost() 0 | 183 | #define rcu_can_boost() 0 |
168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 184 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | 185 | ||
186 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | ||
170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 187 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 188 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
172 | /* and boost task create/destroy. */ | 189 | /* and boost task create/destroy. */ |
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD; | |||
182 | */ | 199 | */ |
183 | static DEFINE_MUTEX(fullstop_mutex); | 200 | static DEFINE_MUTEX(fullstop_mutex); |
184 | 201 | ||
202 | /* Forward reference. */ | ||
203 | static void rcu_torture_cleanup(void); | ||
204 | |||
185 | /* | 205 | /* |
186 | * Detect and respond to a system shutdown. | 206 | * Detect and respond to a system shutdown. |
187 | */ | 207 | */ |
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = { | |||
612 | .name = "srcu" | 632 | .name = "srcu" |
613 | }; | 633 | }; |
614 | 634 | ||
635 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | ||
636 | { | ||
637 | return srcu_read_lock_raw(&srcu_ctl); | ||
638 | } | ||
639 | |||
640 | static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | ||
641 | { | ||
642 | srcu_read_unlock_raw(&srcu_ctl, idx); | ||
643 | } | ||
644 | |||
645 | static struct rcu_torture_ops srcu_raw_ops = { | ||
646 | .init = srcu_torture_init, | ||
647 | .cleanup = srcu_torture_cleanup, | ||
648 | .readlock = srcu_torture_read_lock_raw, | ||
649 | .read_delay = srcu_read_delay, | ||
650 | .readunlock = srcu_torture_read_unlock_raw, | ||
651 | .completed = srcu_torture_completed, | ||
652 | .deferred_free = rcu_sync_torture_deferred_free, | ||
653 | .sync = srcu_torture_synchronize, | ||
654 | .cb_barrier = NULL, | ||
655 | .stats = srcu_torture_stats, | ||
656 | .name = "srcu_raw" | ||
657 | }; | ||
658 | |||
615 | static void srcu_torture_synchronize_expedited(void) | 659 | static void srcu_torture_synchronize_expedited(void) |
616 | { | 660 | { |
617 | synchronize_srcu_expedited(&srcu_ctl); | 661 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg) | |||
913 | return 0; | 957 | return 0; |
914 | } | 958 | } |
915 | 959 | ||
960 | void rcutorture_trace_dump(void) | ||
961 | { | ||
962 | static atomic_t beenhere = ATOMIC_INIT(0); | ||
963 | |||
964 | if (atomic_read(&beenhere)) | ||
965 | return; | ||
966 | if (atomic_xchg(&beenhere, 1) != 0) | ||
967 | return; | ||
968 | do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); | ||
969 | ftrace_dump(DUMP_ALL); | ||
970 | } | ||
971 | |||
916 | /* | 972 | /* |
917 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, | 973 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, |
918 | * incrementing the corresponding element of the pipeline array. The | 974 | * incrementing the corresponding element of the pipeline array. The |
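
The rcutorture_trace_dump() function added above is the usual "do this exactly once" idiom: an atomic_read() fast path so later callers return cheaply, then atomic_xchg() so exactly one of any racing callers wins the right to dump the ftrace buffer. A standalone model using C11 atomics in place of the kernel's atomic_t (an assumption made for portability):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;             /* 0 until the first caller claims the dump */

static void trace_dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                 /* cheap early exit for every later caller */
        if (atomic_exchange(&beenhere, 1) != 0)
                return;                 /* lost the race: another caller is already dumping */
        printf("dumping the ftrace buffer (exactly once)\n");
}

int main(void)
{
        trace_dump_once();              /* performs the dump */
        trace_dump_once();              /* returns immediately */
        return 0;
}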
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused) | |||
934 | rcu_read_lock_bh_held() || | 990 | rcu_read_lock_bh_held() || |
935 | rcu_read_lock_sched_held() || | 991 | rcu_read_lock_sched_held() || |
936 | srcu_read_lock_held(&srcu_ctl)); | 992 | srcu_read_lock_held(&srcu_ctl)); |
993 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
937 | if (p == NULL) { | 994 | if (p == NULL) { |
938 | /* Leave because rcu_torture_writer is not yet underway */ | 995 | /* Leave because rcu_torture_writer is not yet underway */ |
939 | cur_ops->readunlock(idx); | 996 | cur_ops->readunlock(idx); |
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
951 | /* Should not happen, but... */ | 1008 | /* Should not happen, but... */ |
952 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1009 | pipe_count = RCU_TORTURE_PIPE_LEN; |
953 | } | 1010 | } |
1011 | if (pipe_count > 1) | ||
1012 | rcutorture_trace_dump(); | ||
954 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1013 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
955 | completed = cur_ops->completed() - completed; | 1014 | completed = cur_ops->completed() - completed; |
956 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1015 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg) | |||
994 | rcu_read_lock_bh_held() || | 1053 | rcu_read_lock_bh_held() || |
995 | rcu_read_lock_sched_held() || | 1054 | rcu_read_lock_sched_held() || |
996 | srcu_read_lock_held(&srcu_ctl)); | 1055 | srcu_read_lock_held(&srcu_ctl)); |
1056 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
997 | if (p == NULL) { | 1057 | if (p == NULL) { |
998 | /* Wait for rcu_torture_writer to get underway */ | 1058 | /* Wait for rcu_torture_writer to get underway */ |
999 | cur_ops->readunlock(idx); | 1059 | cur_ops->readunlock(idx); |
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg) | |||
1009 | /* Should not happen, but... */ | 1069 | /* Should not happen, but... */ |
1010 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1070 | pipe_count = RCU_TORTURE_PIPE_LEN; |
1011 | } | 1071 | } |
1072 | if (pipe_count > 1) | ||
1073 | rcutorture_trace_dump(); | ||
1012 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1074 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
1013 | completed = cur_ops->completed() - completed; | 1075 | completed = cur_ops->completed() - completed; |
1014 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1076 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page) | |||
1056 | cnt += sprintf(&page[cnt], | 1118 | cnt += sprintf(&page[cnt], |
1057 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1119 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1058 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1120 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1059 | "rtbf: %ld rtb: %ld nt: %ld", | 1121 | "rtbf: %ld rtb: %ld nt: %ld " |
1122 | "onoff: %ld/%ld:%ld/%ld", | ||
1060 | rcu_torture_current, | 1123 | rcu_torture_current, |
1061 | rcu_torture_current_version, | 1124 | rcu_torture_current_version, |
1062 | list_empty(&rcu_torture_freelist), | 1125 | list_empty(&rcu_torture_freelist), |
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page) | |||
1068 | n_rcu_torture_boost_rterror, | 1131 | n_rcu_torture_boost_rterror, |
1069 | n_rcu_torture_boost_failure, | 1132 | n_rcu_torture_boost_failure, |
1070 | n_rcu_torture_boosts, | 1133 | n_rcu_torture_boosts, |
1071 | n_rcu_torture_timers); | 1134 | n_rcu_torture_timers, |
1135 | n_online_successes, | ||
1136 | n_online_attempts, | ||
1137 | n_offline_successes, | ||
1138 | n_offline_attempts); | ||
1072 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1139 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1073 | n_rcu_torture_boost_ktrerror != 0 || | 1140 | n_rcu_torture_boost_ktrerror != 0 || |
1074 | n_rcu_torture_boost_rterror != 0 || | 1141 | n_rcu_torture_boost_rterror != 0 || |
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1232 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1299 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1233 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1300 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1234 | "test_boost=%d/%d test_boost_interval=%d " | 1301 | "test_boost=%d/%d test_boost_interval=%d " |
1235 | "test_boost_duration=%d\n", | 1302 | "test_boost_duration=%d shutdown_secs=%d " |
1303 | "onoff_interval=%d\n", | ||
1236 | torture_type, tag, nrealreaders, nfakewriters, | 1304 | torture_type, tag, nrealreaders, nfakewriters, |
1237 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1305 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1238 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1306 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1239 | test_boost, cur_ops->can_boost, | 1307 | test_boost, cur_ops->can_boost, |
1240 | test_boost_interval, test_boost_duration); | 1308 | test_boost_interval, test_boost_duration, shutdown_secs, |
1309 | onoff_interval); | ||
1241 | } | 1310 | } |
1242 | 1311 | ||
1243 | static struct notifier_block rcutorture_shutdown_nb = { | 1312 | static struct notifier_block rcutorture_shutdown_nb = { |
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu) | |||
1287 | return 0; | 1356 | return 0; |
1288 | } | 1357 | } |
1289 | 1358 | ||
1359 | /* | ||
1360 | * Cause the rcutorture test to shut down the system after the test has | ||
1361 | * run for the time specified by the shutdown_secs module parameter. | ||
1362 | */ | ||
1363 | static int | ||
1364 | rcu_torture_shutdown(void *arg) | ||
1365 | { | ||
1366 | long delta; | ||
1367 | unsigned long jiffies_snap; | ||
1368 | |||
1369 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); | ||
1370 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1371 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | ||
1372 | !kthread_should_stop()) { | ||
1373 | delta = shutdown_time - jiffies_snap; | ||
1374 | if (verbose) | ||
1375 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1376 | "rcu_torture_shutdown task: %lu " | ||
1377 | "jiffies remaining\n", | ||
1378 | torture_type, delta); | ||
1379 | schedule_timeout_interruptible(delta); | ||
1380 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1381 | } | ||
1382 | if (kthread_should_stop()) { | ||
1383 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); | ||
1384 | return 0; | ||
1385 | } | ||
1386 | |||
1387 | /* OK, shut down the system. */ | ||
1388 | |||
1389 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); | ||
1390 | shutdown_task = NULL; /* Avoid self-kill deadlock. */ | ||
1391 | rcu_torture_cleanup(); /* Get the success/failure message. */ | ||
1392 | kernel_power_off(); /* Shut down the system. */ | ||
1393 | return 0; | ||
1394 | } | ||
1395 | |||
1396 | #ifdef CONFIG_HOTPLUG_CPU | ||
1397 | |||
1398 | /* | ||
1399 | * Execute random CPU-hotplug operations at the interval specified | ||
1400 | * by the onoff_interval. | ||
1401 | */ | ||
1402 | static int | ||
1403 | rcu_torture_onoff(void *arg) | ||
1404 | { | ||
1405 | int cpu; | ||
1406 | int maxcpu = -1; | ||
1407 | DEFINE_RCU_RANDOM(rand); | ||
1408 | |||
1409 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | ||
1410 | for_each_online_cpu(cpu) | ||
1411 | maxcpu = cpu; | ||
1412 | WARN_ON(maxcpu < 0); | ||
1413 | while (!kthread_should_stop()) { | ||
1414 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | ||
1415 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | ||
1416 | if (verbose) | ||
1417 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1418 | "rcu_torture_onoff task: offlining %d\n", | ||
1419 | torture_type, cpu); | ||
1420 | n_offline_attempts++; | ||
1421 | if (cpu_down(cpu) == 0) { | ||
1422 | if (verbose) | ||
1423 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1424 | "rcu_torture_onoff task: " | ||
1425 | "offlined %d\n", | ||
1426 | torture_type, cpu); | ||
1427 | n_offline_successes++; | ||
1428 | } | ||
1429 | } else if (cpu_is_hotpluggable(cpu)) { | ||
1430 | if (verbose) | ||
1431 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1432 | "rcu_torture_onoff task: onlining %d\n", | ||
1433 | torture_type, cpu); | ||
1434 | n_online_attempts++; | ||
1435 | if (cpu_up(cpu) == 0) { | ||
1436 | if (verbose) | ||
1437 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1438 | "rcu_torture_onoff task: " | ||
1439 | "onlined %d\n", | ||
1440 | torture_type, cpu); | ||
1441 | n_online_successes++; | ||
1442 | } | ||
1443 | } | ||
1444 | schedule_timeout_interruptible(onoff_interval * HZ); | ||
1445 | } | ||
1446 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); | ||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1450 | static int | ||
1451 | rcu_torture_onoff_init(void) | ||
1452 | { | ||
1453 | if (onoff_interval <= 0) | ||
1454 | return 0; | ||
1455 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); | ||
1456 | if (IS_ERR(onoff_task)) { | ||
1457 | int ret = PTR_ERR(onoff_task); | ||
1458 | |||
1459 | onoff_task = NULL; | ||
1460 | return ret; | ||
1459 | } | ||
1460 | return 0; | ||
1461 | } | ||
1462 | |||
1463 | static void rcu_torture_onoff_cleanup(void) | ||
1464 | { | ||
1465 | if (onoff_task == NULL) | ||
1466 | return; | ||
1467 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | ||
1468 | kthread_stop(onoff_task); | ||
1469 | } | ||
1470 | |||
1471 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1472 | |||
1473 | static int | ||
1474 | rcu_torture_onoff_init(void) | ||
1475 | { | ||
1476 | return 0; | ||
1477 | } | ||
1477 | |||
1478 | static void rcu_torture_onoff_cleanup(void) | ||
1479 | { | ||
1480 | } | ||
1481 | |||
1482 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
1483 | |||
1290 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1484 | static int rcutorture_cpu_notify(struct notifier_block *self, |
1291 | unsigned long action, void *hcpu) | 1485 | unsigned long action, void *hcpu) |
1292 | { | 1486 | { |
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void) | |||
1391 | for_each_possible_cpu(i) | 1585 | for_each_possible_cpu(i) |
1392 | rcutorture_booster_cleanup(i); | 1586 | rcutorture_booster_cleanup(i); |
1393 | } | 1587 | } |
1588 | if (shutdown_task != NULL) { | ||
1589 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | ||
1590 | kthread_stop(shutdown_task); | ||
1591 | } | ||
1592 | rcu_torture_onoff_cleanup(); | ||
1394 | 1593 | ||
1395 | /* Wait for all RCU callbacks to fire. */ | 1594 | /* Wait for all RCU callbacks to fire. */ |
1396 | 1595 | ||
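The rcu_torture_shutdown() task added earlier in this patch sleeps in bounded chunks until a jiffies deadline passes, then powers the system off. A minimal sketch of that deadline pattern, using the stock time_before() helper from <linux/jiffies.h> in place of rcutorture's ULONG_CMP_LT() and a hypothetical wait_for_deadline() thread function, might look like this:

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/*
 * Illustrative only: sleep until "deadline" (in jiffies) or until this
 * kthread is asked to stop.  Re-check after every sleep, because
 * schedule_timeout_interruptible() can return early, for example when
 * kthread_stop() wakes the thread.
 */
static int wait_for_deadline(void *arg)
{
	unsigned long deadline = *(unsigned long *)arg;

	while (time_before(jiffies, deadline) && !kthread_should_stop())
		schedule_timeout_interruptible(deadline - jiffies);
	return 0;
}

The real shutdown task additionally clears shutdown_task before calling kernel_power_off(), so that rcu_torture_cleanup() on the shutdown path does not try to kthread_stop() the very thread that is running it.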
@@ -1416,7 +1615,7 @@ rcu_torture_init(void) | |||
1416 | static struct rcu_torture_ops *torture_ops[] = | 1615 | static struct rcu_torture_ops *torture_ops[] = |
1417 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1616 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1418 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1617 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1419 | &srcu_ops, &srcu_expedited_ops, | 1618 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, |
1420 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1619 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1421 | 1620 | ||
1422 | mutex_lock(&fullstop_mutex); | 1621 | mutex_lock(&fullstop_mutex); |
@@ -1607,6 +1806,18 @@ rcu_torture_init(void) | |||
1607 | } | 1806 | } |
1608 | } | 1807 | } |
1609 | } | 1808 | } |
1809 | if (shutdown_secs > 0) { | ||
1810 | shutdown_time = jiffies + shutdown_secs * HZ; | ||
1811 | shutdown_task = kthread_run(rcu_torture_shutdown, NULL, | ||
1812 | "rcu_torture_shutdown"); | ||
1813 | if (IS_ERR(shutdown_task)) { | ||
1814 | firsterr = PTR_ERR(shutdown_task); | ||
1815 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | ||
1816 | shutdown_task = NULL; | ||
1817 | goto unwind; | ||
1818 | } | ||
1819 | } | ||
1820 | rcu_torture_onoff_init(); | ||
1610 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1821 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1611 | rcutorture_record_test_transition(); | 1822 | rcutorture_record_test_transition(); |
1612 | mutex_unlock(&fullstop_mutex); | 1823 | mutex_unlock(&fullstop_mutex); |
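The new rcu_torture_onoff() kthread stresses RCU's interaction with CPU hotplug by repeatedly offlining a random hotpluggable CPU and onlining CPUs it finds offline. Stripped of the statistics and verbose printing, and with the torture test's private rcu_random() generator swapped for get_random_int(), the core loop reduces to roughly the following sketch (hotplug_stress and its interval argument are placeholders, not rcutorture names):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/sched.h>

/* Illustrative reduction of the hotplug stress loop; interval is in seconds. */
static int hotplug_stress(void *arg)
{
	unsigned long interval = (unsigned long)arg;
	unsigned int cpu;

	while (!kthread_should_stop()) {
		cpu = get_random_int() % nr_cpu_ids;
		if (cpu_online(cpu) && cpu_is_hotpluggable(cpu))
			cpu_down(cpu);		/* Try to offline it... */
		else if (cpu_is_hotpluggable(cpu))
			cpu_up(cpu);		/* ...or bring it back online. */
		schedule_timeout_interruptible(interval * HZ);
	}
	return 0;
}

The real task also counts attempts and successes (n_offline_attempts and friends) so that rcu_torture_printk() can report them in the new "onoff:" field.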
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b76d812740c..6c4a6722abfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
69 | NUM_RCU_LVL_3, \ | 69 | NUM_RCU_LVL_3, \ |
70 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ | 70 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ |
71 | }, \ | 71 | }, \ |
72 | .signaled = RCU_GP_IDLE, \ | 72 | .fqs_state = RCU_GP_IDLE, \ |
73 | .gpnum = -300, \ | 73 | .gpnum = -300, \ |
74 | .completed = -300, \ | 74 | .completed = -300, \ |
75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu) | |||
195 | } | 195 | } |
196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
197 | 197 | ||
198 | #ifdef CONFIG_NO_HZ | ||
199 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 198 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
200 | .dynticks_nesting = 1, | 199 | .dynticks_nesting = DYNTICK_TASK_NESTING, |
201 | .dynticks = ATOMIC_INIT(1), | 200 | .dynticks = ATOMIC_INIT(1), |
202 | }; | 201 | }; |
203 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
204 | 202 | ||
205 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 203 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
206 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 204 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ |
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
328 | return 1; | 326 | return 1; |
329 | } | 327 | } |
330 | 328 | ||
331 | /* If preemptible RCU, no point in sending reschedule IPI. */ | 329 | /* |
332 | if (rdp->preemptible) | 330 | * The CPU is online, so send it a reschedule IPI. This forces |
333 | return 0; | 331 | * it through the scheduler, and (inefficiently) also handles cases |
334 | 332 | * where idle loops fail to inform RCU about the CPU being idle. | |
335 | /* The CPU is online, so send it a reschedule IPI. */ | 333 | */ |
336 | if (rdp->cpu != smp_processor_id()) | 334 | if (rdp->cpu != smp_processor_id()) |
337 | smp_send_reschedule(rdp->cpu); | 335 | smp_send_reschedule(rdp->cpu); |
338 | else | 336 | else |
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
343 | 341 | ||
344 | #endif /* #ifdef CONFIG_SMP */ | 342 | #endif /* #ifdef CONFIG_SMP */ |
345 | 343 | ||
346 | #ifdef CONFIG_NO_HZ | 344 | /* |
345 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | ||
346 | * | ||
347 | * If the new value of the ->dynticks_nesting counter now is zero, | ||
348 | * we really have entered idle, and must do the appropriate accounting. | ||
349 | * The caller must have disabled interrupts. | ||
350 | */ | ||
351 | static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | ||
352 | { | ||
353 | trace_rcu_dyntick("Start", oldval, 0); | ||
354 | if (!is_idle_task(current)) { | ||
355 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
356 | |||
357 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | ||
358 | ftrace_dump(DUMP_ALL); | ||
359 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
360 | current->pid, current->comm, | ||
361 | idle->pid, idle->comm); /* must be idle task! */ | ||
362 | } | ||
363 | rcu_prepare_for_idle(smp_processor_id()); | ||
364 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
365 | smp_mb__before_atomic_inc(); /* See above. */ | ||
366 | atomic_inc(&rdtp->dynticks); | ||
367 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
368 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
369 | } | ||
347 | 370 | ||
348 | /** | 371 | /** |
349 | * rcu_enter_nohz - inform RCU that current CPU is entering nohz | 372 | * rcu_idle_enter - inform RCU that current CPU is entering idle |
350 | * | 373 | * |
351 | * Enter nohz mode, in other words, -leave- the mode in which RCU | 374 | * Enter idle mode, in other words, -leave- the mode in which RCU |
352 | * read-side critical sections can occur. (Though RCU read-side | 375 | * read-side critical sections can occur. (Though RCU read-side |
353 | * critical sections can occur in irq handlers in nohz mode, a possibility | 376 | * critical sections can occur in irq handlers in idle, a possibility |
354 | * handled by rcu_irq_enter() and rcu_irq_exit()). | 377 | * handled by irq_enter() and irq_exit().) |
378 | * | ||
379 | * We crowbar the ->dynticks_nesting field to zero to allow for | ||
380 | * the possibility of usermode upcalls having messed up our count | ||
381 | * of interrupt nesting level during the prior busy period. | ||
355 | */ | 382 | */ |
356 | void rcu_enter_nohz(void) | 383 | void rcu_idle_enter(void) |
357 | { | 384 | { |
358 | unsigned long flags; | 385 | unsigned long flags; |
386 | long long oldval; | ||
359 | struct rcu_dynticks *rdtp; | 387 | struct rcu_dynticks *rdtp; |
360 | 388 | ||
361 | local_irq_save(flags); | 389 | local_irq_save(flags); |
362 | rdtp = &__get_cpu_var(rcu_dynticks); | 390 | rdtp = &__get_cpu_var(rcu_dynticks); |
363 | if (--rdtp->dynticks_nesting) { | 391 | oldval = rdtp->dynticks_nesting; |
364 | local_irq_restore(flags); | 392 | rdtp->dynticks_nesting = 0; |
365 | return; | 393 | rcu_idle_enter_common(rdtp, oldval); |
366 | } | ||
367 | trace_rcu_dyntick("Start"); | ||
368 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
369 | smp_mb__before_atomic_inc(); /* See above. */ | ||
370 | atomic_inc(&rdtp->dynticks); | ||
371 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
372 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
373 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
374 | } | 395 | } |
375 | 396 | ||
376 | /* | 397 | /** |
377 | * rcu_exit_nohz - inform RCU that current CPU is leaving nohz | 398 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
399 | * | ||
400 | * Exit from an interrupt handler, which might possibly result in entering | ||
401 | * idle mode, in other words, leaving the mode in which read-side critical | ||
402 | * sections can occur. | ||
378 | * | 403 | * |
379 | * Exit nohz mode, in other words, -enter- the mode in which RCU | 404 | * This code assumes that the idle loop never does anything that might |
380 | * read-side critical sections normally occur. | 405 | * result in unbalanced calls to irq_enter() and irq_exit(). If your |
406 | * architecture violates this assumption, RCU will give you what you | ||
407 | * deserve, good and hard. But very infrequently and irreproducibly. | ||
408 | * | ||
409 | * Use things like work queues to work around this limitation. | ||
410 | * | ||
411 | * You have been warned. | ||
381 | */ | 412 | */ |
382 | void rcu_exit_nohz(void) | 413 | void rcu_irq_exit(void) |
383 | { | 414 | { |
384 | unsigned long flags; | 415 | unsigned long flags; |
416 | long long oldval; | ||
385 | struct rcu_dynticks *rdtp; | 417 | struct rcu_dynticks *rdtp; |
386 | 418 | ||
387 | local_irq_save(flags); | 419 | local_irq_save(flags); |
388 | rdtp = &__get_cpu_var(rcu_dynticks); | 420 | rdtp = &__get_cpu_var(rcu_dynticks); |
389 | if (rdtp->dynticks_nesting++) { | 421 | oldval = rdtp->dynticks_nesting; |
390 | local_irq_restore(flags); | 422 | rdtp->dynticks_nesting--; |
391 | return; | 423 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
392 | } | 424 | if (rdtp->dynticks_nesting) |
425 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | ||
426 | else | ||
427 | rcu_idle_enter_common(rdtp, oldval); | ||
428 | local_irq_restore(flags); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle | ||
433 | * | ||
434 | * If the new value of the ->dynticks_nesting counter was previously zero, | ||
435 | * we really have exited idle, and must do the appropriate accounting. | ||
436 | * The caller must have disabled interrupts. | ||
437 | */ | ||
438 | static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | ||
439 | { | ||
393 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 440 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ |
394 | atomic_inc(&rdtp->dynticks); | 441 | atomic_inc(&rdtp->dynticks); |
395 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 442 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
396 | smp_mb__after_atomic_inc(); /* See above. */ | 443 | smp_mb__after_atomic_inc(); /* See above. */ |
397 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 444 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
398 | trace_rcu_dyntick("End"); | 445 | rcu_cleanup_after_idle(smp_processor_id()); |
446 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | ||
447 | if (!is_idle_task(current)) { | ||
448 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
449 | |||
450 | trace_rcu_dyntick("Error on exit: not idle task", | ||
451 | oldval, rdtp->dynticks_nesting); | ||
452 | ftrace_dump(DUMP_ALL); | ||
453 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
454 | current->pid, current->comm, | ||
455 | idle->pid, idle->comm); /* must be idle task! */ | ||
456 | } | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | ||
461 | * | ||
462 | * Exit idle mode, in other words, -enter- the mode in which RCU | ||
463 | * read-side critical sections can occur. | ||
464 | * | ||
465 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to | ||
466 | * allow for the possibility of usermode upcalls messing up our count | ||
467 | * of interrupt nesting level during the busy period that is just | ||
468 | * now starting. | ||
469 | */ | ||
470 | void rcu_idle_exit(void) | ||
471 | { | ||
472 | unsigned long flags; | ||
473 | struct rcu_dynticks *rdtp; | ||
474 | long long oldval; | ||
475 | |||
476 | local_irq_save(flags); | ||
477 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
478 | oldval = rdtp->dynticks_nesting; | ||
479 | WARN_ON_ONCE(oldval != 0); | ||
480 | rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
481 | rcu_idle_exit_common(rdtp, oldval); | ||
482 | local_irq_restore(flags); | ||
483 | } | ||
484 | |||
485 | /** | ||
486 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | ||
487 | * | ||
488 | * Enter an interrupt handler, which might possibly result in exiting | ||
489 | * idle mode, in other words, entering the mode in which read-side critical | ||
490 | * sections can occur. | ||
491 | * | ||
492 | * Note that the Linux kernel is fully capable of entering an interrupt | ||
493 | * handler that it never exits, for example when doing upcalls to | ||
494 | * user mode! This code assumes that the idle loop never does upcalls to | ||
495 | * user mode. If your architecture does do upcalls from the idle loop (or | ||
496 | * does anything else that results in unbalanced calls to the irq_enter() | ||
497 | * and irq_exit() functions), RCU will give you what you deserve, good | ||
498 | * and hard. But very infrequently and irreproducibly. | ||
499 | * | ||
500 | * Use things like work queues to work around this limitation. | ||
501 | * | ||
502 | * You have been warned. | ||
503 | */ | ||
504 | void rcu_irq_enter(void) | ||
505 | { | ||
506 | unsigned long flags; | ||
507 | struct rcu_dynticks *rdtp; | ||
508 | long long oldval; | ||
509 | |||
510 | local_irq_save(flags); | ||
511 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
512 | oldval = rdtp->dynticks_nesting; | ||
513 | rdtp->dynticks_nesting++; | ||
514 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | ||
515 | if (oldval) | ||
516 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | ||
517 | else | ||
518 | rcu_idle_exit_common(rdtp, oldval); | ||
399 | local_irq_restore(flags); | 519 | local_irq_restore(flags); |
400 | } | 520 | } |
401 | 521 | ||
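The rewritten idle-entry/exit code above maintains a per-CPU ->dynticks counter whose value is even exactly while the CPU is in dyntick-idle mode, plus a ->dynticks_nesting count of the process- and irq-level reasons the CPU has for staying non-idle. The following stand-alone user-space model illustrates that parity protocol; it is a deliberate simplification (nesting is crowbarred to 0/1 rather than DYNTICK_TASK_NESTING, and the atomics and memory barriers are omitted), and the function names are the model's, not the kernel's:

#include <assert.h>
#include <stdio.h>

static long long dynticks = 1;		/* Odd: the CPU starts out non-idle. */
static long long dynticks_nesting = 1;	/* Stand-in for DYNTICK_TASK_NESTING. */

static int cpu_is_idle(void)
{
	return (dynticks & 0x1) == 0;
}

static void model_idle_enter(void)	{ dynticks_nesting = 0; dynticks++; }
static void model_idle_exit(void)	{ dynticks_nesting = 1; dynticks++; }
static void model_irq_enter(void)	{ if (dynticks_nesting++ == 0) dynticks++; }
static void model_irq_exit(void)	{ if (--dynticks_nesting == 0) dynticks++; }

int main(void)
{
	assert(!cpu_is_idle());
	model_idle_enter();		/* Idle loop entered: counter goes even. */
	assert(cpu_is_idle());
	model_irq_enter();		/* Interrupt taken from idle: odd again. */
	assert(!cpu_is_idle());
	model_irq_exit();		/* Interrupt finished: back to idle (even). */
	assert(cpu_is_idle());
	model_idle_exit();		/* Leaving the idle loop: odd. */
	assert(!cpu_is_idle());
	printf("dynticks parity model OK\n");
	return 0;
}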
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void) | |||
442 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 562 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
443 | } | 563 | } |
444 | 564 | ||
565 | #ifdef CONFIG_PROVE_RCU | ||
566 | |||
445 | /** | 567 | /** |
446 | * rcu_irq_enter - inform RCU of entry to hard irq context | 568 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle |
447 | * | 569 | * |
448 | * If the CPU was idle with dynamic ticks active, this updates the | 570 | * If the current CPU is in its idle loop and is neither in an interrupt |
449 | * rdtp->dynticks to let the RCU handling know that the CPU is active. | 571 | * nor an NMI handler, return true. |
450 | */ | 572 | */ |
451 | void rcu_irq_enter(void) | 573 | int rcu_is_cpu_idle(void) |
452 | { | 574 | { |
453 | rcu_exit_nohz(); | 575 | int ret; |
576 | |||
577 | preempt_disable(); | ||
578 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | ||
579 | preempt_enable(); | ||
580 | return ret; | ||
454 | } | 581 | } |
582 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
583 | |||
584 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
455 | 585 | ||
456 | /** | 586 | /** |
457 | * rcu_irq_exit - inform RCU of exit from hard irq context | 587 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle |
458 | * | 588 | * |
459 | * If the CPU was idle with dynamic ticks active, update the rdp->dynticks | 589 | * If the current CPU is idle or running at a first-level (not nested) |
460 | * to put let the RCU handling be aware that the CPU is going back to idle | 590 | * interrupt from idle, return true. The caller must have at least |
461 | * with no ticks. | 591 | * disabled preemption. |
462 | */ | 592 | */ |
463 | void rcu_irq_exit(void) | 593 | int rcu_is_cpu_rrupt_from_idle(void) |
464 | { | 594 | { |
465 | rcu_enter_nohz(); | 595 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; |
466 | } | 596 | } |
467 | 597 | ||
468 | #ifdef CONFIG_SMP | 598 | #ifdef CONFIG_SMP |
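Under CONFIG_PROVE_RCU, rcu_is_cpu_idle() lets debugging code detect RCU read-side primitives being used from the dyntick-idle loop, where they are silently ignored. A hedged example of the kind of assertion a caller might add; the helper name is hypothetical, and it assumes the rcu_is_cpu_idle() declaration is reachable through <linux/rcupdate.h> when CONFIG_PROVE_RCU is set:

#include <linux/bug.h>
#include <linux/rcupdate.h>

/*
 * Hypothetical debug helper: warn once if this point is reached while
 * RCU believes the current CPU to be in dyntick-idle mode, in which
 * case rcu_read_lock() would provide no protection.
 */
static inline void debug_assert_rcu_watching(void)
{
#ifdef CONFIG_PROVE_RCU
	WARN_ON_ONCE(rcu_is_cpu_idle());
#endif
}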
@@ -475,7 +605,7 @@ void rcu_irq_exit(void) | |||
475 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 605 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
476 | { | 606 | { |
477 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 607 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
478 | return 0; | 608 | return (rdp->dynticks_snap & 0x1) == 0; |
479 | } | 609 | } |
480 | 610 | ||
481 | /* | 611 | /* |
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
512 | 642 | ||
513 | #endif /* #ifdef CONFIG_SMP */ | 643 | #endif /* #ifdef CONFIG_SMP */ |
514 | 644 | ||
515 | #else /* #ifdef CONFIG_NO_HZ */ | ||
516 | |||
517 | #ifdef CONFIG_SMP | ||
518 | |||
519 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | ||
520 | { | ||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | ||
525 | { | ||
526 | return rcu_implicit_offline_qs(rdp); | ||
527 | } | ||
528 | |||
529 | #endif /* #ifdef CONFIG_SMP */ | ||
530 | |||
531 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
532 | |||
533 | int rcu_cpu_stall_suppress __read_mostly; | ||
534 | |||
535 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 645 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
536 | { | 646 | { |
537 | rsp->gp_start = jiffies; | 647 | rsp->gp_start = jiffies; |
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
866 | /* Advance to a new grace period and initialize state. */ | 976 | /* Advance to a new grace period and initialize state. */ |
867 | rsp->gpnum++; | 977 | rsp->gpnum++; |
868 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 978 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); |
869 | WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); | 979 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); |
870 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | 980 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ |
871 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 981 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
872 | record_gp_stall_check_time(rsp); | 982 | record_gp_stall_check_time(rsp); |
873 | 983 | ||
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
877 | rnp->qsmask = rnp->qsmaskinit; | 987 | rnp->qsmask = rnp->qsmaskinit; |
878 | rnp->gpnum = rsp->gpnum; | 988 | rnp->gpnum = rsp->gpnum; |
879 | rnp->completed = rsp->completed; | 989 | rnp->completed = rsp->completed; |
880 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 990 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ |
881 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 991 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
882 | rcu_preempt_boost_start_gp(rnp); | 992 | rcu_preempt_boost_start_gp(rnp); |
883 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 993 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
927 | 1037 | ||
928 | rnp = rcu_get_root(rsp); | 1038 | rnp = rcu_get_root(rsp); |
929 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1039 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
930 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 1040 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
931 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1041 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
932 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1042 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
933 | } | 1043 | } |
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
991 | 1101 | ||
992 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | 1102 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ |
993 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1103 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
994 | rsp->signaled = RCU_GP_IDLE; | 1104 | rsp->fqs_state = RCU_GP_IDLE; |
995 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 1105 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
996 | } | 1106 | } |
997 | 1107 | ||
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1221 | else | 1331 | else |
1222 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1332 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1223 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1333 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1224 | rcu_report_exp_rnp(rsp, rnp); | 1334 | rcu_report_exp_rnp(rsp, rnp, true); |
1225 | rcu_node_kthread_setaffinity(rnp, -1); | 1335 | rcu_node_kthread_setaffinity(rnp, -1); |
1226 | } | 1336 | } |
1227 | 1337 | ||
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1263 | /* If no callbacks are ready, just return.*/ | 1373 | /* If no callbacks are ready, just return.*/ |
1264 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1374 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
1265 | trace_rcu_batch_start(rsp->name, 0, 0); | 1375 | trace_rcu_batch_start(rsp->name, 0, 0); |
1266 | trace_rcu_batch_end(rsp->name, 0); | 1376 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), |
1377 | need_resched(), is_idle_task(current), | ||
1378 | rcu_is_callbacks_kthread()); | ||
1267 | return; | 1379 | return; |
1268 | } | 1380 | } |
1269 | 1381 | ||
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1291 | debug_rcu_head_unqueue(list); | 1403 | debug_rcu_head_unqueue(list); |
1292 | __rcu_reclaim(rsp->name, list); | 1404 | __rcu_reclaim(rsp->name, list); |
1293 | list = next; | 1405 | list = next; |
1294 | if (++count >= bl) | 1406 | /* Stop only if limit reached and CPU has something to do. */ |
1407 | if (++count >= bl && | ||
1408 | (need_resched() || | ||
1409 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | ||
1295 | break; | 1410 | break; |
1296 | } | 1411 | } |
1297 | 1412 | ||
1298 | local_irq_save(flags); | 1413 | local_irq_save(flags); |
1299 | trace_rcu_batch_end(rsp->name, count); | 1414 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), |
1415 | is_idle_task(current), | ||
1416 | rcu_is_callbacks_kthread()); | ||
1300 | 1417 | ||
1301 | /* Update count, and requeue any remaining callbacks. */ | 1418 | /* Update count, and requeue any remaining callbacks. */ |
1302 | rdp->qlen -= count; | 1419 | rdp->qlen -= count; |
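The rcu_do_batch() change above relaxes the old hard blimit cutoff: once the limit is reached, callback invocation stops only if the CPU has something better to do. A small sketch isolating that continuation test, with the rcu_is_callbacks_kthread() result passed in as a plain flag rather than called directly:

#include <linux/sched.h>
#include <linux/types.h>

/*
 * Illustrative predicate mirroring the new loop-exit test: after "count"
 * callbacks have been invoked, stop only if rescheduling is needed, or if
 * the CPU is neither idle nor running the dedicated callbacks kthread.
 */
static bool should_stop_draining(long count, long blimit, bool on_cb_kthread)
{
	return count >= blimit &&
	       (need_resched() ||
		(!is_idle_task(current) && !on_cb_kthread));
}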
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1334 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). | 1451 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). |
1335 | * Also schedule RCU core processing. | 1452 | * Also schedule RCU core processing. |
1336 | * | 1453 | * |
1337 | * This function must be called with hardirqs disabled. It is normally | 1454 | * This function must be called from hardirq context. It is normally |
1338 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 1455 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
1339 | * false, there is no point in invoking rcu_check_callbacks(). | 1456 | * false, there is no point in invoking rcu_check_callbacks(). |
1340 | */ | 1457 | */ |
1341 | void rcu_check_callbacks(int cpu, int user) | 1458 | void rcu_check_callbacks(int cpu, int user) |
1342 | { | 1459 | { |
1343 | trace_rcu_utilization("Start scheduler-tick"); | 1460 | trace_rcu_utilization("Start scheduler-tick"); |
1344 | if (user || | 1461 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
1345 | (idle_cpu(cpu) && rcu_scheduler_active && | ||
1346 | !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
1347 | 1462 | ||
1348 | /* | 1463 | /* |
1349 | * Get here if this CPU took its interrupt from user | 1464 | * Get here if this CPU took its interrupt from user |
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1457 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ | 1572 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
1458 | } | 1573 | } |
1459 | rsp->fqs_active = 1; | 1574 | rsp->fqs_active = 1; |
1460 | switch (rsp->signaled) { | 1575 | switch (rsp->fqs_state) { |
1461 | case RCU_GP_IDLE: | 1576 | case RCU_GP_IDLE: |
1462 | case RCU_GP_INIT: | 1577 | case RCU_GP_INIT: |
1463 | 1578 | ||
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1473 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1588 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
1474 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 1589 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1475 | if (rcu_gp_in_progress(rsp)) | 1590 | if (rcu_gp_in_progress(rsp)) |
1476 | rsp->signaled = RCU_FORCE_QS; | 1591 | rsp->fqs_state = RCU_FORCE_QS; |
1477 | break; | 1592 | break; |
1478 | 1593 | ||
1479 | case RCU_FORCE_QS: | 1594 | case RCU_FORCE_QS: |
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu) | |||
1812 | * by the current CPU, even if none need be done immediately, returning | 1927 | * by the current CPU, even if none need be done immediately, returning |
1813 | * 1 if so. | 1928 | * 1 if so. |
1814 | */ | 1929 | */ |
1815 | static int rcu_needs_cpu_quick_check(int cpu) | 1930 | static int rcu_cpu_has_callbacks(int cpu) |
1816 | { | 1931 | { |
1817 | /* RCU callbacks either ready or pending? */ | 1932 | /* RCU callbacks either ready or pending? */ |
1818 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1933 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1913 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 2028 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1914 | rdp->nxttail[i] = &rdp->nxtlist; | 2029 | rdp->nxttail[i] = &rdp->nxtlist; |
1915 | rdp->qlen = 0; | 2030 | rdp->qlen = 0; |
1916 | #ifdef CONFIG_NO_HZ | ||
1917 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2031 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1918 | #endif /* #ifdef CONFIG_NO_HZ */ | 2032 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); |
2033 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | ||
1919 | rdp->cpu = cpu; | 2034 | rdp->cpu = cpu; |
1920 | rdp->rsp = rsp; | 2035 | rdp->rsp = rsp; |
1921 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2036 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1942 | rdp->qlen_last_fqs_check = 0; | 2057 | rdp->qlen_last_fqs_check = 0; |
1943 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2058 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1944 | rdp->blimit = blimit; | 2059 | rdp->blimit = blimit; |
2060 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
2061 | atomic_set(&rdp->dynticks->dynticks, | ||
2062 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | ||
2063 | rcu_prepare_for_idle_init(cpu); | ||
1945 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2064 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1946 | 2065 | ||
1947 | /* | 2066 | /* |
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2023 | rcu_send_cbs_to_online(&rcu_bh_state); | 2142 | rcu_send_cbs_to_online(&rcu_bh_state); |
2024 | rcu_send_cbs_to_online(&rcu_sched_state); | 2143 | rcu_send_cbs_to_online(&rcu_sched_state); |
2025 | rcu_preempt_send_cbs_to_online(); | 2144 | rcu_preempt_send_cbs_to_online(); |
2145 | rcu_cleanup_after_idle(cpu); | ||
2026 | break; | 2146 | break; |
2027 | case CPU_DEAD: | 2147 | case CPU_DEAD: |
2028 | case CPU_DEAD_FROZEN: | 2148 | case CPU_DEAD_FROZEN: |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 849ce9ec51fe..fddff92d6676 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -84,9 +84,10 @@ | |||
84 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
85 | */ | 85 | */ |
86 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
87 | int dynticks_nesting; /* Track irq/process nesting level. */ | 87 | long long dynticks_nesting; /* Track irq/process nesting level. */ |
88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | ||
90 | }; | 91 | }; |
91 | 92 | ||
92 | /* RCU's kthread states for tracing. */ | 93 | /* RCU's kthread states for tracing. */ |
@@ -274,16 +275,12 @@ struct rcu_data { | |||
274 | /* did other CPU force QS recently? */ | 275 | /* did other CPU force QS recently? */ |
275 | long blimit; /* Upper limit on a processed batch */ | 276 | long blimit; /* Upper limit on a processed batch */ |
276 | 277 | ||
277 | #ifdef CONFIG_NO_HZ | ||
278 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
281 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
282 | 281 | ||
283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 282 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
284 | #ifdef CONFIG_NO_HZ | ||
285 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 283 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
286 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
287 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 284 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
288 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 285 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
289 | 286 | ||
@@ -302,16 +299,12 @@ struct rcu_data { | |||
302 | struct rcu_state *rsp; | 299 | struct rcu_state *rsp; |
303 | }; | 300 | }; |
304 | 301 | ||
305 | /* Values for signaled field in struct rcu_state. */ | 302 | /* Values for fqs_state field in struct rcu_state. */ |
306 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 303 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
307 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 304 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
308 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 305 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
309 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 306 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
310 | #ifdef CONFIG_NO_HZ | ||
311 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 307 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
312 | #else /* #ifdef CONFIG_NO_HZ */ | ||
313 | #define RCU_SIGNAL_INIT RCU_FORCE_QS | ||
314 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
315 | 308 | ||
316 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 309 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
317 | 310 | ||
@@ -361,7 +354,7 @@ struct rcu_state { | |||
361 | 354 | ||
362 | /* The following fields are guarded by the root rcu_node's lock. */ | 355 | /* The following fields are guarded by the root rcu_node's lock. */ |
363 | 356 | ||
364 | u8 signaled ____cacheline_internodealigned_in_smp; | 357 | u8 fqs_state ____cacheline_internodealigned_in_smp; |
365 | /* Force QS state. */ | 358 | /* Force QS state. */ |
366 | u8 fqs_active; /* force_quiescent_state() */ | 359 | u8 fqs_active; /* force_quiescent_state() */ |
367 | /* is running. */ | 360 | /* is running. */ |
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu); | |||
451 | static void rcu_preempt_process_callbacks(void); | 444 | static void rcu_preempt_process_callbacks(void); |
452 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 445 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
453 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 446 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) |
454 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | 447 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
448 | bool wake); | ||
455 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 449 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
456 | static int rcu_preempt_pending(int cpu); | 450 | static int rcu_preempt_pending(int cpu); |
457 | static int rcu_preempt_needs_cpu(int cpu); | 451 | static int rcu_preempt_needs_cpu(int cpu); |
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void); | |||
461 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 455 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
462 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 456 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
463 | static void invoke_rcu_callbacks_kthread(void); | 457 | static void invoke_rcu_callbacks_kthread(void); |
458 | static bool rcu_is_callbacks_kthread(void); | ||
464 | #ifdef CONFIG_RCU_BOOST | 459 | #ifdef CONFIG_RCU_BOOST |
465 | static void rcu_preempt_do_callbacks(void); | 460 | static void rcu_preempt_do_callbacks(void); |
466 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | 461 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, |
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | |||
473 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 468 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
474 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | 469 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); |
475 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 470 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
471 | static void rcu_prepare_for_idle_init(int cpu); | ||
472 | static void rcu_cleanup_after_idle(int cpu); | ||
473 | static void rcu_prepare_for_idle(int cpu); | ||
476 | 474 | ||
477 | #endif /* #ifndef RCU_TREE_NONCORE */ | 475 | #endif /* #ifndef RCU_TREE_NONCORE */ |
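The fqs_state field renamed above steps through RCU_GP_IDLE, RCU_GP_INIT, RCU_SAVE_DYNTICK and RCU_FORCE_QS, and force_quiescent_state() in rcutree.c dispatches on it. A reduced model of that dispatch, with the scanning work replaced by comments and model-local names so as not to collide with the kernel's:

/* Values mirror the fqs_state #defines above. */
enum fqs_state_model {
	MODEL_GP_IDLE = 0,	/* No grace period in progress. */
	MODEL_GP_INIT = 1,	/* Grace period being initialized. */
	MODEL_SAVE_DYNTICK = 2,	/* Need to scan dyntick state. */
	MODEL_FORCE_QS = 3,	/* Need to force quiescent state. */
};

/* One force_quiescent_state()-style step: returns the next state. */
static enum fqs_state_model fqs_step(enum fqs_state_model state)
{
	switch (state) {
	case MODEL_GP_IDLE:
	case MODEL_GP_INIT:
		return state;		/* Grace period not yet ready for FQS. */
	case MODEL_SAVE_DYNTICK:
		/* Snapshot each CPU's ->dynticks counter here. */
		return MODEL_FORCE_QS;
	case MODEL_FORCE_QS:
		/* Recheck holdout CPUs against their snapshots here. */
		return MODEL_FORCE_QS;
	}
	return state;
}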
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b9b9f8a4184..8bb35d73e1f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
312 | { | 312 | { |
313 | int empty; | 313 | int empty; |
314 | int empty_exp; | 314 | int empty_exp; |
315 | int empty_exp_now; | ||
315 | unsigned long flags; | 316 | unsigned long flags; |
316 | struct list_head *np; | 317 | struct list_head *np; |
317 | #ifdef CONFIG_RCU_BOOST | 318 | #ifdef CONFIG_RCU_BOOST |
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
382 | /* | 383 | /* |
383 | * If this was the last task on the current list, and if | 384 | * If this was the last task on the current list, and if |
384 | * we aren't waiting on any CPUs, report the quiescent state. | 385 | * we aren't waiting on any CPUs, report the quiescent state. |
385 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 386 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
387 | * so we must take a snapshot of the expedited state. | ||
386 | */ | 388 | */ |
389 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | ||
387 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 390 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
388 | trace_rcu_quiescent_state_report("preempt_rcu", | 391 | trace_rcu_quiescent_state_report("preempt_rcu", |
389 | rnp->gpnum, | 392 | rnp->gpnum, |
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
406 | * If this was the last task on the expedited lists, | 409 | * If this was the last task on the expedited lists, |
407 | * then we need to report up the rcu_node hierarchy. | 410 | * then we need to report up the rcu_node hierarchy. |
408 | */ | 411 | */ |
409 | if (!empty_exp && !rcu_preempted_readers_exp(rnp)) | 412 | if (!empty_exp && empty_exp_now) |
410 | rcu_report_exp_rnp(&rcu_preempt_state, rnp); | 413 | rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); |
411 | } else { | 414 | } else { |
412 | local_irq_restore(flags); | 415 | local_irq_restore(flags); |
413 | } | 416 | } |
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
729 | * recursively up the tree. (Calm down, calm down, we do the recursion | 732 | * recursively up the tree. (Calm down, calm down, we do the recursion |
730 | * iteratively!) | 733 | * iteratively!) |
731 | * | 734 | * |
735 | * Most callers will set the "wake" flag, but the task initiating the | ||
736 | * expedited grace period need not wake itself. | ||
737 | * | ||
732 | * Caller must hold sync_rcu_preempt_exp_mutex. | 738 | * Caller must hold sync_rcu_preempt_exp_mutex. |
733 | */ | 739 | */ |
734 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 740 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
741 | bool wake) | ||
735 | { | 742 | { |
736 | unsigned long flags; | 743 | unsigned long flags; |
737 | unsigned long mask; | 744 | unsigned long mask; |
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
744 | } | 751 | } |
745 | if (rnp->parent == NULL) { | 752 | if (rnp->parent == NULL) { |
746 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 753 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
747 | wake_up(&sync_rcu_preempt_exp_wq); | 754 | if (wake) |
755 | wake_up(&sync_rcu_preempt_exp_wq); | ||
748 | break; | 756 | break; |
749 | } | 757 | } |
750 | mask = rnp->grpmask; | 758 | mask = rnp->grpmask; |
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
777 | must_wait = 1; | 785 | must_wait = 1; |
778 | } | 786 | } |
779 | if (!must_wait) | 787 | if (!must_wait) |
780 | rcu_report_exp_rnp(rsp, rnp); | 788 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ |
781 | } | 789 | } |
782 | 790 | ||
783 | /* | 791 | /* |
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
1069 | * report on tasks preempted in RCU read-side critical sections during | 1077 | * report on tasks preempted in RCU read-side critical sections during |
1070 | * expedited RCU grace periods. | 1078 | * expedited RCU grace periods. |
1071 | */ | 1079 | */ |
1072 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 1080 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
1081 | bool wake) | ||
1073 | { | 1082 | { |
1074 | return; | ||
1075 | } | 1083 | } |
1076 | 1084 | ||
1077 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1085 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
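rcu_report_exp_rnp() now takes a wake argument so that, as noted in the comment added above, the task initiating the expedited grace period does not pointlessly wake itself when it reports completion before it has started waiting. The general shape of that idiom, with hypothetical names and a bare flag standing in for the rcu_node bookkeeping (memory ordering is glossed over in this sketch):

#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>

/* Hypothetical completion flag and wait queue, for illustration only. */
static DECLARE_WAIT_QUEUE_HEAD(exp_done_wq);
static int exp_done;

/*
 * Report that the last outstanding reader is gone.  Tasks reporting on
 * behalf of others pass wake=true; the initiating task, which has not
 * yet slept and will test exp_done before it does, passes wake=false.
 */
static void report_exp_done(bool wake)
{
	exp_done = 1;
	if (wake)
		wake_up(&exp_done_wq);
}

static void wait_for_exp_done(void)
{
	wait_event(exp_done_wq, exp_done);
}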
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) | |||
1157 | 1165 | ||
1158 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 1166 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
1159 | 1167 | ||
1160 | static struct lock_class_key rcu_boost_class; | ||
1161 | |||
1162 | /* | 1168 | /* |
1163 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 1169 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
1164 | * or ->boost_tasks, advancing the pointer to the next task in the | 1170 | * or ->boost_tasks, advancing the pointer to the next task in the |
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1221 | */ | 1227 | */ |
1222 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1228 | t = container_of(tb, struct task_struct, rcu_node_entry); |
1223 | rt_mutex_init_proxy_locked(&mtx, t); | 1229 | rt_mutex_init_proxy_locked(&mtx, t); |
1224 | /* Avoid lockdep false positives. This rt_mutex is its own thing. */ | ||
1225 | lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, | ||
1226 | "rcu_boost_mutex"); | ||
1227 | t->rcu_boost_mutex = &mtx; | 1230 | t->rcu_boost_mutex = &mtx; |
1228 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1231 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1229 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | 1232 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ |
1230 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 1233 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
1231 | 1234 | ||
1232 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | 1235 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
1236 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | ||
1233 | } | 1237 | } |
1234 | 1238 | ||
1235 | /* | 1239 | /* |
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1329 | } | 1333 | } |
1330 | 1334 | ||
1331 | /* | 1335 | /* |
1336 | * Is the current CPU running the RCU-callbacks kthread? | ||
1337 | * Caller must have preemption disabled. | ||
1338 | */ | ||
1339 | static bool rcu_is_callbacks_kthread(void) | ||
1340 | { | ||
1341 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1332 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | 1345 | * Set the affinity of the boost kthread. The CPU-hotplug locks are |
1333 | * held, so no one should be messing with the existence of the boost | 1346 | * held, so no one should be messing with the existence of the boost |
1334 | * kthread. | 1347 | * kthread. |
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1772 | WARN_ON_ONCE(1); | 1785 | WARN_ON_ONCE(1); |
1773 | } | 1786 | } |
1774 | 1787 | ||
1788 | static bool rcu_is_callbacks_kthread(void) | ||
1789 | { | ||
1790 | return false; | ||
1791 | } | ||
1792 | |||
1775 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | 1793 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) |
1776 | { | 1794 | { |
1777 | } | 1795 | } |
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void) | |||
1907 | * grace period works for us. | 1925 | * grace period works for us. |
1908 | */ | 1926 | */ |
1909 | get_online_cpus(); | 1927 | get_online_cpus(); |
1910 | snap = atomic_read(&sync_sched_expedited_started) - 1; | 1928 | snap = atomic_read(&sync_sched_expedited_started); |
1911 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 1929 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
1912 | } | 1930 | } |
1913 | 1931 | ||
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |||
1939 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1957 | * 1 if so. This function is part of the RCU implementation; it is -not- |
1940 | * an exported member of the RCU API. | 1958 | * an exported member of the RCU API. |
1941 | * | 1959 | * |
1942 | * Because we have preemptible RCU, just check whether this CPU needs | 1960 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1943 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | 1961 | * any flavor of RCU. |
1944 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
1945 | */ | 1962 | */ |
1946 | int rcu_needs_cpu(int cpu) | 1963 | int rcu_needs_cpu(int cpu) |
1947 | { | 1964 | { |
1948 | return rcu_needs_cpu_quick_check(cpu); | 1965 | return rcu_cpu_has_callbacks(cpu); |
1966 | } | ||
1967 | |||
1968 | /* | ||
1969 | * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. | ||
1970 | */ | ||
1971 | static void rcu_prepare_for_idle_init(int cpu) | ||
1972 | { | ||
1973 | } | ||
1974 | |||
1975 | /* | ||
1976 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | ||
1977 | * after it. | ||
1978 | */ | ||
1979 | static void rcu_cleanup_after_idle(int cpu) | ||
1980 | { | ||
1981 | } | ||
1982 | |||
1983 | /* | ||
1984 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | ||
1985 | * is nothing. | ||
1986 | */ | ||
1987 | static void rcu_prepare_for_idle(int cpu) | ||
1988 | { | ||
1949 | } | 1989 | } |
1950 | 1990 | ||
1951 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1991 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1952 | 1992 | ||
1953 | #define RCU_NEEDS_CPU_FLUSHES 5 | 1993 | /* |
1994 | * This code is invoked when a CPU goes idle, at which point we want | ||
1995 | * to have the CPU do everything required for RCU so that it can enter | ||
1996 | * the energy-efficient dyntick-idle mode. This is handled by a | ||
1997 | * state machine implemented by rcu_prepare_for_idle() below. | ||
1998 | * | ||
1999 | * The following three preprocessor symbols control this state machine: | ||
2000 | * | ||
2001 | * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt | ||
2002 | * to satisfy RCU. Beyond this point, it is better to incur a periodic | ||
2003 | * scheduling-clock interrupt than to loop through the state machine | ||
2004 | * at full power. | ||
2005 | * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are | ||
2006 | * optional if RCU does not need anything immediately from this | ||
2007 | * CPU, even if this CPU still has RCU callbacks queued. The first | ||
2008 | * few times through the state machine are mandatory: we need to give | ||
2009 | * the state machine a chance to communicate a quiescent state | ||
2010 | * to the RCU core. | ||
2011 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | ||
2012 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | ||
2013 | * is sized to be roughly one RCU grace period. Those energy-efficiency | ||
2014 | * benchmarkers who might otherwise be tempted to set this to a large | ||
2015 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | ||
2016 | * system. And if you are -that- concerned about energy efficiency, | ||
2017 | * just power the system down and be done with it! | ||
2018 | * | ||
2019 | * The values below work well in practice. If future workloads require | ||
2020 | * adjustment, they can be converted into kernel config parameters, though | ||
2021 | * making the state machine smarter might be a better option. | ||
2022 | */ | ||
2023 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | ||
2024 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | ||
2025 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | ||
2026 | |||
1954 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 2027 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1955 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 2028 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
2029 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | ||
2030 | static ktime_t rcu_idle_gp_wait; | ||
1956 | 2031 | ||
1957 | /* | 2032 | /* |
1958 | * Check to see if any future RCU-related work will need to be done | 2033 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
1959 | * by the current CPU, even if none need be done immediately, returning | 2034 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter |
1960 | * 1 if so. This function is part of the RCU implementation; it is -not- | 2035 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to |
1961 | * an exported member of the RCU API. | 2036 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed |
2037 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2038 | * it is better to incur scheduling-clock interrupts than to spin | ||
2039 | * continuously for the same time duration! | ||
2040 | */ | ||
2041 | int rcu_needs_cpu(int cpu) | ||
2042 | { | ||
2043 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2044 | if (!rcu_cpu_has_callbacks(cpu)) | ||
2045 | return 0; | ||
2046 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
2047 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
2048 | } | ||
2049 | |||
2050 | /* | ||
2051 | * Timer handler used to force CPU to start pushing its remaining RCU | ||
2052 | * callbacks in the case where it entered dyntick-idle mode with callbacks | ||
2053 | * pending. The handler doesn't really need to do anything because the | ||
2054 | * real work is done upon re-entry to idle, or by the next scheduling-clock | ||
2055 | * interrupt should idle not be re-entered. | ||
2056 | */ | ||
2057 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | ||
2058 | { | ||
2059 | trace_rcu_prep_idle("Timer"); | ||
2060 | return HRTIMER_NORESTART; | ||
2061 | } | ||
2062 | |||
2063 | /* | ||
2064 | * Initialize the timer used to pull CPUs out of dyntick-idle mode. | ||
2065 | */ | ||
2066 | static void rcu_prepare_for_idle_init(int cpu) | ||
2067 | { | ||
2068 | static int firsttime = 1; | ||
2069 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2070 | |||
2071 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2072 | hrtp->function = rcu_idle_gp_timer_func; | ||
2073 | if (firsttime) { | ||
2074 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2075 | |||
2076 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2077 | firsttime = 0; | ||
2078 | } | ||
2079 | } | ||
2080 | |||
2081 | /* | ||
2082 | * Clean up for exit from idle. Because we are exiting from idle, there | ||
2083 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | ||
2084 | * do nothing if this timer is not active, so just cancel it unconditionally. | ||
2085 | */ | ||
2086 | static void rcu_cleanup_after_idle(int cpu) | ||
2087 | { | ||
2088 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | ||
2089 | } | ||
2090 | |||
2091 | /* | ||
2092 | * Check to see if any RCU-related work can be done by the current CPU, | ||
2093 | * and if so, schedule a softirq to get it done. This function is part | ||
2094 | * of the RCU implementation; it is -not- an exported member of the RCU API. | ||
1962 | * | 2095 | * |
1963 | * Because we are not supporting preemptible RCU, attempt to accelerate | 2096 | * The idea is for the current CPU to clear out all work required by the |
1964 | * any current grace periods so that RCU no longer needs this CPU, but | 2097 | * RCU core for the current grace period, so that this CPU can be permitted |
1965 | * only if all other CPUs are already in dynticks-idle mode. This will | 2098 | * to enter dyntick-idle mode. In some cases, it will need to be awakened |
1966 | * allow the CPU cores to be powered down immediately, as opposed to after | 2099 | * at the end of the grace period by whatever CPU ends the grace period. |
1967 | * waiting many milliseconds for grace periods to elapse. | 2100 | * This allows CPUs to go dyntick-idle more quickly, and to reduce the |
2101 | * number of wakeups by a modest integer factor. | ||
1968 | * | 2102 | * |
1969 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2103 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1970 | * disabled, we do one pass of force_quiescent_state(), then do a | 2104 | * disabled, we do one pass of force_quiescent_state(), then do a |
1971 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2105 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
1972 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2106 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
2107 | * | ||
2108 | * The caller must have disabled interrupts. | ||
1973 | */ | 2109 | */ |
1974 | int rcu_needs_cpu(int cpu) | 2110 | static void rcu_prepare_for_idle(int cpu) |
1975 | { | 2111 | { |
1976 | int c = 0; | 2112 | unsigned long flags; |
1977 | int snap; | 2113 | |
1978 | int thatcpu; | 2114 | local_irq_save(flags); |
1979 | 2115 | ||
1980 | /* Check for being in the holdoff period. */ | 2116 | /* |
1981 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) | 2117 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
1982 | return rcu_needs_cpu_quick_check(cpu); | 2118 | * Also reset state to avoid prejudicing later attempts. |
1983 | 2119 | */ | |
1984 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | 2120 | if (!rcu_cpu_has_callbacks(cpu)) { |
1985 | for_each_online_cpu(thatcpu) { | 2121 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
1986 | if (thatcpu == cpu) | 2122 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
1987 | continue; | 2123 | local_irq_restore(flags); |
1988 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, | 2124 | trace_rcu_prep_idle("No callbacks"); |
1989 | thatcpu).dynticks); | 2125 | return; |
1990 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 2126 | } |
1991 | if ((snap & 0x1) != 0) { | 2127 | |
1992 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2128 | /* |
1993 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2129 | * If in holdoff mode, just return. We will presumably have |
1994 | return rcu_needs_cpu_quick_check(cpu); | 2130 | * refrained from disabling the scheduling-clock tick. |
1995 | } | 2131 | */ |
2132 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | ||
2133 | local_irq_restore(flags); | ||
2134 | trace_rcu_prep_idle("In holdoff"); | ||
2135 | return; | ||
1996 | } | 2136 | } |
1997 | 2137 | ||
1998 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2138 | /* Check and update the rcu_dyntick_drain sequencing. */ |
1999 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2139 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2000 | /* First time through, initialize the counter. */ | 2140 | /* First time through, initialize the counter. */ |
2001 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | 2141 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; |
2142 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | ||
2143 | !rcu_pending(cpu)) { | ||
2144 | /* Can we go dyntick-idle despite still having callbacks? */ | ||
2145 | trace_rcu_prep_idle("Dyntick with callbacks"); | ||
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
2147 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
2148 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | ||
2149 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | ||
2150 | return; /* Nothing more to do immediately. */ | ||
2002 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2151 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2003 | /* We have hit the limit, so time to give up. */ | 2152 | /* We have hit the limit, so time to give up. */ |
2004 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2153 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2005 | return rcu_needs_cpu_quick_check(cpu); | 2154 | local_irq_restore(flags); |
2155 | trace_rcu_prep_idle("Begin holdoff"); | ||
2156 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | ||
2157 | return; | ||
2006 | } | 2158 | } |
2007 | 2159 | ||
2008 | /* Do one step pushing remaining RCU callbacks through. */ | 2160 | /* |
2161 | * Do one step of pushing the remaining RCU callbacks through | ||
2162 | * the RCU core state machine. | ||
2163 | */ | ||
2164 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
2165 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | ||
2166 | local_irq_restore(flags); | ||
2167 | rcu_preempt_qs(cpu); | ||
2168 | force_quiescent_state(&rcu_preempt_state, 0); | ||
2169 | local_irq_save(flags); | ||
2170 | } | ||
2171 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2009 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 2172 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
2173 | local_irq_restore(flags); | ||
2010 | rcu_sched_qs(cpu); | 2174 | rcu_sched_qs(cpu); |
2011 | force_quiescent_state(&rcu_sched_state, 0); | 2175 | force_quiescent_state(&rcu_sched_state, 0); |
2012 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | 2176 | local_irq_save(flags); |
2013 | } | 2177 | } |
2014 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 2178 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
2179 | local_irq_restore(flags); | ||
2015 | rcu_bh_qs(cpu); | 2180 | rcu_bh_qs(cpu); |
2016 | force_quiescent_state(&rcu_bh_state, 0); | 2181 | force_quiescent_state(&rcu_bh_state, 0); |
2017 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | 2182 | local_irq_save(flags); |
2018 | } | 2183 | } |
2019 | 2184 | ||
2020 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 2185 | /* |
2021 | if (c) | 2186 | * If RCU callbacks are still pending, RCU still needs this CPU. |
2187 | * So try forcing the callbacks through the grace period. | ||
2188 | */ | ||
2189 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2190 | local_irq_restore(flags); | ||
2191 | trace_rcu_prep_idle("More callbacks"); | ||
2022 | invoke_rcu_core(); | 2192 | invoke_rcu_core(); |
2023 | return c; | 2193 | } else { |
2194 | local_irq_restore(flags); | ||
2195 | trace_rcu_prep_idle("Callbacks drained"); | ||
2196 | } | ||
2024 | } | 2197 | } |
2025 | 2198 | ||
2026 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2199 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
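Condensing the sequencing above: a sketch of the per-pass decision made by rcu_prepare_for_idle(), with tracing, irq handling, and the actual callback pushing stripped out. rcu_idle_decision() is an illustrative name, not part of the patch:

	/* Illustrative only: returns 1 if the CPU may enter dyntick-idle right away. */
	static int rcu_idle_decision(int cpu)
	{
		int *drain = &per_cpu(rcu_dyntick_drain, cpu);

		if (*drain <= 0) {
			*drain = RCU_IDLE_FLUSHES;	/* start a new drain pass */
		} else if (*drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu)) {
			*drain = 0;
			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
			return 1;	/* go dyntick-idle now; rcu_idle_gp_timer is the backstop */
		} else if (--(*drain) <= 0) {
			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;	/* enter holdoff */
			return 0;	/* kick the RCU core instead */
		}
		return 0;	/* otherwise: push callbacks via one force_quiescent_state() pass */
	}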
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9feffa4c0695..654cfe67f0d1 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
67 | rdp->completed, rdp->gpnum, | 67 | rdp->completed, rdp->gpnum, |
68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
69 | rdp->qs_pending); | 69 | rdp->qs_pending); |
70 | #ifdef CONFIG_NO_HZ | 70 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
71 | seq_printf(m, " dt=%d/%d/%d df=%lu", | ||
72 | atomic_read(&rdp->dynticks->dynticks), | 71 | atomic_read(&rdp->dynticks->dynticks), |
73 | rdp->dynticks->dynticks_nesting, | 72 | rdp->dynticks->dynticks_nesting, |
74 | rdp->dynticks->dynticks_nmi_nesting, | 73 | rdp->dynticks->dynticks_nmi_nesting, |
75 | rdp->dynticks_fqs); | 74 | rdp->dynticks_fqs); |
76 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
77 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 75 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
78 | seq_printf(m, " ql=%ld qs=%c%c%c%c", | 76 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
79 | rdp->qlen, | 77 | rdp->qlen, |
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
141 | rdp->completed, rdp->gpnum, | 139 | rdp->completed, rdp->gpnum, |
142 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 140 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
143 | rdp->qs_pending); | 141 | rdp->qs_pending); |
144 | #ifdef CONFIG_NO_HZ | 142 | seq_printf(m, ",%d,%llx,%d,%lu", |
145 | seq_printf(m, ",%d,%d,%d,%lu", | ||
146 | atomic_read(&rdp->dynticks->dynticks), | 143 | atomic_read(&rdp->dynticks->dynticks), |
147 | rdp->dynticks->dynticks_nesting, | 144 | rdp->dynticks->dynticks_nesting, |
148 | rdp->dynticks->dynticks_nmi_nesting, | 145 | rdp->dynticks->dynticks_nmi_nesting, |
149 | rdp->dynticks_fqs); | 146 | rdp->dynticks_fqs); |
150 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
151 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 147 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
152 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, | 148 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
153 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 149 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
171 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 167 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
172 | { | 168 | { |
173 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
174 | #ifdef CONFIG_NO_HZ | ||
175 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
176 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
177 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); | 171 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
178 | #ifdef CONFIG_RCU_BOOST | 172 | #ifdef CONFIG_RCU_BOOST |
179 | seq_puts(m, "\"kt\",\"ktl\""); | 173 | seq_puts(m, "\"kt\",\"ktl\""); |
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
278 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
279 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
280 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
281 | rsp->completed, gpnum, rsp->signaled, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
282 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
283 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
284 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 8eafd1bd273e..16502d3a71c8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
101 | 101 | ||
102 | printk("\n============================================\n"); | 102 | printk("\n============================================\n"); |
103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); |
104 | printk("%s\n", print_tainted()); | ||
104 | printk( "--------------------------------------------\n"); | 105 | printk( "--------------------------------------------\n"); |
105 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 106 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
106 | task->comm, task_pid_nr(task), | 107 | task->comm, task_pid_nr(task), |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index f9d8482dd487..a242e691c993 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
579 | struct rt_mutex_waiter *waiter) | 579 | struct rt_mutex_waiter *waiter) |
580 | { | 580 | { |
581 | int ret = 0; | 581 | int ret = 0; |
582 | int was_disabled; | ||
583 | 582 | ||
584 | for (;;) { | 583 | for (;;) { |
585 | /* Try to acquire the lock: */ | 584 | /* Try to acquire the lock: */ |
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
602 | 601 | ||
603 | raw_spin_unlock(&lock->wait_lock); | 602 | raw_spin_unlock(&lock->wait_lock); |
604 | 603 | ||
605 | was_disabled = irqs_disabled(); | ||
606 | if (was_disabled) | ||
607 | local_irq_enable(); | ||
608 | |||
609 | debug_rt_mutex_print_deadlock(waiter); | 604 | debug_rt_mutex_print_deadlock(waiter); |
610 | 605 | ||
611 | schedule_rt_mutex(lock); | 606 | schedule_rt_mutex(lock); |
612 | 607 | ||
613 | if (was_disabled) | ||
614 | local_irq_disable(); | ||
615 | |||
616 | raw_spin_lock(&lock->wait_lock); | 608 | raw_spin_lock(&lock->wait_lock); |
617 | set_current_state(state); | 609 | set_current_state(state); |
618 | } | 610 | } |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000000..9a7dd35102a3 --- /dev/null +++ b/kernel/sched/Makefile | |||
@@ -0,0 +1,20 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_clock.o = -pg | ||
3 | endif | ||
4 | |||
5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
6 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
7 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
8 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
9 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
10 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | ||
12 | endif | ||
13 | |||
14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | ||
15 | obj-$(CONFIG_SMP) += cpupri.o | ||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | ||
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | ||
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | ||
19 | |||
20 | |||
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c484..e8a1f83ee0e7 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c | |||
@@ -1,15 +1,19 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include "sched.h" | ||
4 | |||
3 | #include <linux/proc_fs.h> | 5 | #include <linux/proc_fs.h> |
4 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
5 | #include <linux/kallsyms.h> | 7 | #include <linux/kallsyms.h> |
6 | #include <linux/utsname.h> | 8 | #include <linux/utsname.h> |
9 | #include <linux/security.h> | ||
10 | #include <linux/export.h> | ||
7 | 11 | ||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 12 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
9 | static struct autogroup autogroup_default; | 13 | static struct autogroup autogroup_default; |
10 | static atomic_t autogroup_seq_nr; | 14 | static atomic_t autogroup_seq_nr; |
11 | 15 | ||
12 | static void __init autogroup_init(struct task_struct *init_task) | 16 | void __init autogroup_init(struct task_struct *init_task) |
13 | { | 17 | { |
14 | autogroup_default.tg = &root_task_group; | 18 | autogroup_default.tg = &root_task_group; |
15 | kref_init(&autogroup_default.kref); | 19 | kref_init(&autogroup_default.kref); |
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) | |||
17 | init_task->signal->autogroup = &autogroup_default; | 21 | init_task->signal->autogroup = &autogroup_default; |
18 | } | 22 | } |
19 | 23 | ||
20 | static inline void autogroup_free(struct task_group *tg) | 24 | void autogroup_free(struct task_group *tg) |
21 | { | 25 | { |
22 | kfree(tg->autogroup); | 26 | kfree(tg->autogroup); |
23 | } | 27 | } |
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
59 | return ag; | 63 | return ag; |
60 | } | 64 | } |
61 | 65 | ||
62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
63 | static void free_rt_sched_group(struct task_group *tg); | ||
64 | #endif | ||
65 | |||
66 | static inline struct autogroup *autogroup_create(void) | 66 | static inline struct autogroup *autogroup_create(void) |
67 | { | 67 | { |
68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
@@ -108,8 +108,7 @@ out_fail: | |||
108 | return autogroup_kref_get(&autogroup_default); | 108 | return autogroup_kref_get(&autogroup_default); |
109 | } | 109 | } |
110 | 110 | ||
111 | static inline bool | 111 | bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) |
112 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
113 | { | 112 | { |
114 | if (tg != &root_task_group) | 113 | if (tg != &root_task_group) |
115 | return false; | 114 | return false; |
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
127 | return true; | 126 | return true; |
128 | } | 127 | } |
129 | 128 | ||
130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
131 | { | ||
132 | return !!tg->autogroup; | ||
133 | } | ||
134 | |||
135 | static inline struct task_group * | ||
136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
137 | { | ||
138 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
139 | |||
140 | if (enabled && task_wants_autogroup(p, tg)) | ||
141 | return p->signal->autogroup->tg; | ||
142 | |||
143 | return tg; | ||
144 | } | ||
145 | |||
146 | static void | 129 | static void |
147 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 130 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
148 | { | 131 | { |
@@ -263,7 +246,7 @@ out: | |||
263 | #endif /* CONFIG_PROC_FS */ | 246 | #endif /* CONFIG_PROC_FS */ |
264 | 247 | ||
265 | #ifdef CONFIG_SCHED_DEBUG | 248 | #ifdef CONFIG_SCHED_DEBUG |
266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 249 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
267 | { | 250 | { |
268 | if (!task_group_is_autogroup(tg)) | 251 | if (!task_group_is_autogroup(tg)) |
269 | return 0; | 252 | return 0; |
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dca..8bd047142816 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h | |||
@@ -1,5 +1,8 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include <linux/kref.h> | ||
4 | #include <linux/rwsem.h> | ||
5 | |||
3 | struct autogroup { | 6 | struct autogroup { |
4 | /* | 7 | /* |
5 | * reference doesn't mean how many threads attach to this | 8 | * reference doesn't mean how many threads attach to this |
@@ -13,9 +16,28 @@ struct autogroup { | |||
13 | int nice; | 16 | int nice; |
14 | }; | 17 | }; |
15 | 18 | ||
16 | static inline bool task_group_is_autogroup(struct task_group *tg); | 19 | extern void autogroup_init(struct task_struct *init_task); |
20 | extern void autogroup_free(struct task_group *tg); | ||
21 | |||
22 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
23 | { | ||
24 | return !!tg->autogroup; | ||
25 | } | ||
26 | |||
27 | extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); | ||
28 | |||
17 | static inline struct task_group * | 29 | static inline struct task_group * |
18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
31 | { | ||
32 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
33 | |||
34 | if (enabled && task_wants_autogroup(p, tg)) | ||
35 | return p->signal->autogroup->tg; | ||
36 | |||
37 | return tg; | ||
38 | } | ||
39 | |||
40 | extern int autogroup_path(struct task_group *tg, char *buf, int buflen); | ||
19 | 41 | ||
20 | #else /* !CONFIG_SCHED_AUTOGROUP */ | 42 | #else /* !CONFIG_SCHED_AUTOGROUP */ |
21 | 43 | ||
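The autogroup_task_group() inline above is consumed by task_group(), which now lives in kernel/sched/sched.h. A trimmed sketch of that caller, based on the version removed from kernel/sched.c further down, with the lockdep-checked task_subsys_state_check() simplified to task_subsys_state():

	/* Trimmed sketch of the task_group() helper served by this header. */
	static inline struct task_group *task_group(struct task_struct *p)
	{
		struct cgroup_subsys_state *css;

		css = task_subsys_state(p, cpu_cgroup_subsys_id);
		return autogroup_task_group(p, container_of(css, struct task_group, css));
	}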
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index c685e31492df..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c | |||
diff --git a/kernel/sched.c b/kernel/sched/core.c index a7f381a78469..2a4590fabcad 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
@@ -56,7 +56,6 @@ | |||
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/proc_fs.h> | 57 | #include <linux/proc_fs.h> |
58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
59 | #include <linux/stop_machine.h> | ||
60 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
61 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
62 | #include <linux/times.h> | 61 | #include <linux/times.h> |
@@ -75,129 +74,17 @@ | |||
75 | 74 | ||
76 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
78 | #include <asm/mutex.h> | ||
79 | #ifdef CONFIG_PARAVIRT | 77 | #ifdef CONFIG_PARAVIRT |
80 | #include <asm/paravirt.h> | 78 | #include <asm/paravirt.h> |
81 | #endif | 79 | #endif |
82 | 80 | ||
83 | #include "sched_cpupri.h" | 81 | #include "sched.h" |
84 | #include "workqueue_sched.h" | 82 | #include "../workqueue_sched.h" |
85 | #include "sched_autogroup.h" | ||
86 | 83 | ||
87 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
89 | 86 | ||
90 | /* | 87 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
91 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
92 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
93 | * and back. | ||
94 | */ | ||
95 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
96 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
97 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
98 | |||
99 | /* | ||
100 | * 'User priority' is the nice value converted to something we | ||
101 | * can work with better when scaling various scheduler parameters, | ||
102 | * it's a [ 0 ... 39 ] range. | ||
103 | */ | ||
104 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
105 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
106 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
107 | |||
108 | /* | ||
109 | * Helpers for converting nanosecond timing to jiffy resolution | ||
110 | */ | ||
111 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
112 | |||
113 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
114 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
115 | |||
116 | /* | ||
117 | * These are the 'tuning knobs' of the scheduler: | ||
118 | * | ||
119 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
120 | * Timeslices get refilled after they expire. | ||
121 | */ | ||
122 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
123 | |||
124 | /* | ||
125 | * single value that denotes runtime == period, ie unlimited time. | ||
126 | */ | ||
127 | #define RUNTIME_INF ((u64)~0ULL) | ||
128 | |||
129 | static inline int rt_policy(int policy) | ||
130 | { | ||
131 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
132 | return 1; | ||
133 | return 0; | ||
134 | } | ||
135 | |||
136 | static inline int task_has_rt_policy(struct task_struct *p) | ||
137 | { | ||
138 | return rt_policy(p->policy); | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * This is the priority-queue data structure of the RT scheduling class: | ||
143 | */ | ||
144 | struct rt_prio_array { | ||
145 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
146 | struct list_head queue[MAX_RT_PRIO]; | ||
147 | }; | ||
148 | |||
149 | struct rt_bandwidth { | ||
150 | /* nests inside the rq lock: */ | ||
151 | raw_spinlock_t rt_runtime_lock; | ||
152 | ktime_t rt_period; | ||
153 | u64 rt_runtime; | ||
154 | struct hrtimer rt_period_timer; | ||
155 | }; | ||
156 | |||
157 | static struct rt_bandwidth def_rt_bandwidth; | ||
158 | |||
159 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
160 | |||
161 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
162 | { | ||
163 | struct rt_bandwidth *rt_b = | ||
164 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
165 | ktime_t now; | ||
166 | int overrun; | ||
167 | int idle = 0; | ||
168 | |||
169 | for (;;) { | ||
170 | now = hrtimer_cb_get_time(timer); | ||
171 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
172 | |||
173 | if (!overrun) | ||
174 | break; | ||
175 | |||
176 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
177 | } | ||
178 | |||
179 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
180 | } | ||
181 | |||
182 | static | ||
183 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
184 | { | ||
185 | rt_b->rt_period = ns_to_ktime(period); | ||
186 | rt_b->rt_runtime = runtime; | ||
187 | |||
188 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
189 | |||
190 | hrtimer_init(&rt_b->rt_period_timer, | ||
191 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
192 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
193 | } | ||
194 | |||
195 | static inline int rt_bandwidth_enabled(void) | ||
196 | { | ||
197 | return sysctl_sched_rt_runtime >= 0; | ||
198 | } | ||
199 | |||
200 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | ||
201 | { | 88 | { |
202 | unsigned long delta; | 89 | unsigned long delta; |
203 | ktime_t soft, hard, now; | 90 | ktime_t soft, hard, now; |
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | |||
217 | } | 104 | } |
218 | } | 105 | } |
219 | 106 | ||
220 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 107 | DEFINE_MUTEX(sched_domains_mutex); |
221 | { | 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
222 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
223 | return; | ||
224 | |||
225 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
226 | return; | ||
227 | |||
228 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
229 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
230 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
231 | } | ||
232 | |||
233 | #ifdef CONFIG_RT_GROUP_SCHED | ||
234 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
235 | { | ||
236 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
237 | } | ||
238 | #endif | ||
239 | |||
240 | /* | ||
241 | * sched_domains_mutex serializes calls to init_sched_domains, | ||
242 | * detach_destroy_domains and partition_sched_domains. | ||
243 | */ | ||
244 | static DEFINE_MUTEX(sched_domains_mutex); | ||
245 | |||
246 | #ifdef CONFIG_CGROUP_SCHED | ||
247 | |||
248 | #include <linux/cgroup.h> | ||
249 | |||
250 | struct cfs_rq; | ||
251 | |||
252 | static LIST_HEAD(task_groups); | ||
253 | |||
254 | struct cfs_bandwidth { | ||
255 | #ifdef CONFIG_CFS_BANDWIDTH | ||
256 | raw_spinlock_t lock; | ||
257 | ktime_t period; | ||
258 | u64 quota, runtime; | ||
259 | s64 hierarchal_quota; | ||
260 | u64 runtime_expires; | ||
261 | |||
262 | int idle, timer_active; | ||
263 | struct hrtimer period_timer, slack_timer; | ||
264 | struct list_head throttled_cfs_rq; | ||
265 | |||
266 | /* statistics */ | ||
267 | int nr_periods, nr_throttled; | ||
268 | u64 throttled_time; | ||
269 | #endif | ||
270 | }; | ||
271 | |||
272 | /* task group related information */ | ||
273 | struct task_group { | ||
274 | struct cgroup_subsys_state css; | ||
275 | |||
276 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
277 | /* schedulable entities of this group on each cpu */ | ||
278 | struct sched_entity **se; | ||
279 | /* runqueue "owned" by this group on each cpu */ | ||
280 | struct cfs_rq **cfs_rq; | ||
281 | unsigned long shares; | ||
282 | |||
283 | atomic_t load_weight; | ||
284 | #endif | ||
285 | |||
286 | #ifdef CONFIG_RT_GROUP_SCHED | ||
287 | struct sched_rt_entity **rt_se; | ||
288 | struct rt_rq **rt_rq; | ||
289 | |||
290 | struct rt_bandwidth rt_bandwidth; | ||
291 | #endif | ||
292 | |||
293 | struct rcu_head rcu; | ||
294 | struct list_head list; | ||
295 | |||
296 | struct task_group *parent; | ||
297 | struct list_head siblings; | ||
298 | struct list_head children; | ||
299 | |||
300 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
301 | struct autogroup *autogroup; | ||
302 | #endif | ||
303 | |||
304 | struct cfs_bandwidth cfs_bandwidth; | ||
305 | }; | ||
306 | |||
307 | /* task_group_lock serializes the addition/removal of task groups */ | ||
308 | static DEFINE_SPINLOCK(task_group_lock); | ||
309 | |||
310 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
311 | |||
312 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
313 | |||
314 | /* | ||
315 | * A weight of 0 or 1 can cause arithmetic problems. | ||
316 | * The weight of a cfs_rq is the sum of the weights of the entities | ||
317 | * queued on it, so the weight of an entity should not be too large, | ||
318 | * and neither should the shares value of a task group. | ||
319 | * (The default weight is 1024 - so there's no practical | ||
320 | * limitation from this.) | ||
321 | */ | ||
322 | #define MIN_SHARES (1UL << 1) | ||
323 | #define MAX_SHARES (1UL << 18) | ||
324 | |||
325 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | ||
326 | #endif | ||
327 | |||
328 | /* Default task group. | ||
329 | * Every task in the system belongs to this group at bootup. | ||
330 | */ | ||
331 | struct task_group root_task_group; | ||
332 | |||
333 | #endif /* CONFIG_CGROUP_SCHED */ | ||
334 | |||
335 | /* CFS-related fields in a runqueue */ | ||
336 | struct cfs_rq { | ||
337 | struct load_weight load; | ||
338 | unsigned long nr_running, h_nr_running; | ||
339 | |||
340 | u64 exec_clock; | ||
341 | u64 min_vruntime; | ||
342 | #ifndef CONFIG_64BIT | ||
343 | u64 min_vruntime_copy; | ||
344 | #endif | ||
345 | |||
346 | struct rb_root tasks_timeline; | ||
347 | struct rb_node *rb_leftmost; | ||
348 | |||
349 | struct list_head tasks; | ||
350 | struct list_head *balance_iterator; | ||
351 | |||
352 | /* | ||
353 | * 'curr' points to currently running entity on this cfs_rq. | ||
354 | * It is set to NULL otherwise (i.e when none are currently running). | ||
355 | */ | ||
356 | struct sched_entity *curr, *next, *last, *skip; | ||
357 | |||
358 | #ifdef CONFIG_SCHED_DEBUG | ||
359 | unsigned int nr_spread_over; | ||
360 | #endif | ||
361 | |||
362 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
363 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
364 | |||
365 | /* | ||
366 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
367 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
368 | * (like users, containers etc.) | ||
369 | * | ||
370 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
371 | * list is used during load balance. | ||
372 | */ | ||
373 | int on_list; | ||
374 | struct list_head leaf_cfs_rq_list; | ||
375 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
376 | |||
377 | #ifdef CONFIG_SMP | ||
378 | /* | ||
379 | * the part of load.weight contributed by tasks | ||
380 | */ | ||
381 | unsigned long task_weight; | ||
382 | |||
383 | /* | ||
384 | * h_load = weight * f(tg) | ||
385 | * | ||
386 | * Where f(tg) is the recursive weight fraction assigned to | ||
387 | * this group. | ||
388 | */ | ||
389 | unsigned long h_load; | ||
390 | |||
391 | /* | ||
392 | * Maintaining per-cpu shares distribution for group scheduling | ||
393 | * | ||
394 | * load_stamp is the last time we updated the load average | ||
395 | * load_last is the last time we updated the load average and saw load | ||
396 | * load_unacc_exec_time is currently unaccounted execution time | ||
397 | */ | ||
398 | u64 load_avg; | ||
399 | u64 load_period; | ||
400 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
401 | |||
402 | unsigned long load_contribution; | ||
403 | #endif | ||
404 | #ifdef CONFIG_CFS_BANDWIDTH | ||
405 | int runtime_enabled; | ||
406 | u64 runtime_expires; | ||
407 | s64 runtime_remaining; | ||
408 | |||
409 | u64 throttled_timestamp; | ||
410 | int throttled, throttle_count; | ||
411 | struct list_head throttled_list; | ||
412 | #endif | ||
413 | #endif | ||
414 | }; | ||
415 | |||
416 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
417 | #ifdef CONFIG_CFS_BANDWIDTH | ||
418 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
419 | { | ||
420 | return &tg->cfs_bandwidth; | ||
421 | } | ||
422 | |||
423 | static inline u64 default_cfs_period(void); | ||
424 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
425 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
426 | |||
427 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
428 | { | ||
429 | struct cfs_bandwidth *cfs_b = | ||
430 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
431 | do_sched_cfs_slack_timer(cfs_b); | ||
432 | |||
433 | return HRTIMER_NORESTART; | ||
434 | } | ||
435 | |||
436 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
437 | { | ||
438 | struct cfs_bandwidth *cfs_b = | ||
439 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
440 | ktime_t now; | ||
441 | int overrun; | ||
442 | int idle = 0; | ||
443 | |||
444 | for (;;) { | ||
445 | now = hrtimer_cb_get_time(timer); | ||
446 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
447 | |||
448 | if (!overrun) | ||
449 | break; | ||
450 | |||
451 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
452 | } | ||
453 | |||
454 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
455 | } | ||
456 | |||
457 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
458 | { | ||
459 | raw_spin_lock_init(&cfs_b->lock); | ||
460 | cfs_b->runtime = 0; | ||
461 | cfs_b->quota = RUNTIME_INF; | ||
462 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
463 | |||
464 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
465 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
466 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
467 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
468 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
469 | } | ||
470 | |||
471 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
472 | { | ||
473 | cfs_rq->runtime_enabled = 0; | ||
474 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
475 | } | ||
476 | |||
477 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
478 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
479 | { | ||
480 | /* | ||
481 | * The timer may be active because we're trying to set a new bandwidth | ||
482 | * period or because we're racing with the tear-down path | ||
483 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
484 | * terminates). In either case we ensure that it's re-programmed | ||
485 | */ | ||
486 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
487 | raw_spin_unlock(&cfs_b->lock); | ||
488 | /* ensure cfs_b->lock is available while we wait */ | ||
489 | hrtimer_cancel(&cfs_b->period_timer); | ||
490 | |||
491 | raw_spin_lock(&cfs_b->lock); | ||
492 | /* if someone else restarted the timer then we're done */ | ||
493 | if (cfs_b->timer_active) | ||
494 | return; | ||
495 | } | ||
496 | |||
497 | cfs_b->timer_active = 1; | ||
498 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
499 | } | ||
500 | |||
501 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
502 | { | ||
503 | hrtimer_cancel(&cfs_b->period_timer); | ||
504 | hrtimer_cancel(&cfs_b->slack_timer); | ||
505 | } | ||
506 | #else | ||
507 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
508 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
510 | |||
511 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
512 | { | ||
513 | return NULL; | ||
514 | } | ||
515 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
516 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
517 | |||
518 | /* Real-Time classes' related field in a runqueue: */ | ||
519 | struct rt_rq { | ||
520 | struct rt_prio_array active; | ||
521 | unsigned long rt_nr_running; | ||
522 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
523 | struct { | ||
524 | int curr; /* highest queued rt task prio */ | ||
525 | #ifdef CONFIG_SMP | ||
526 | int next; /* next highest */ | ||
527 | #endif | ||
528 | } highest_prio; | ||
529 | #endif | ||
530 | #ifdef CONFIG_SMP | ||
531 | unsigned long rt_nr_migratory; | ||
532 | unsigned long rt_nr_total; | ||
533 | int overloaded; | ||
534 | struct plist_head pushable_tasks; | ||
535 | #endif | ||
536 | int rt_throttled; | ||
537 | u64 rt_time; | ||
538 | u64 rt_runtime; | ||
539 | /* Nests inside the rq lock: */ | ||
540 | raw_spinlock_t rt_runtime_lock; | ||
541 | |||
542 | #ifdef CONFIG_RT_GROUP_SCHED | ||
543 | unsigned long rt_nr_boosted; | ||
544 | |||
545 | struct rq *rq; | ||
546 | struct list_head leaf_rt_rq_list; | ||
547 | struct task_group *tg; | ||
548 | #endif | ||
549 | }; | ||
550 | |||
551 | #ifdef CONFIG_SMP | ||
552 | |||
553 | /* | ||
554 | * We add the notion of a root-domain which will be used to define per-domain | ||
555 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
556 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
557 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
558 | * object. | ||
559 | * | ||
560 | */ | ||
561 | struct root_domain { | ||
562 | atomic_t refcount; | ||
563 | atomic_t rto_count; | ||
564 | struct rcu_head rcu; | ||
565 | cpumask_var_t span; | ||
566 | cpumask_var_t online; | ||
567 | |||
568 | /* | ||
569 | * The "RT overload" flag: it gets set if a CPU has more than | ||
570 | * one runnable RT task. | ||
571 | */ | ||
572 | cpumask_var_t rto_mask; | ||
573 | struct cpupri cpupri; | ||
574 | }; | ||
575 | |||
576 | /* | ||
577 | * By default the system creates a single root-domain with all cpus as | ||
578 | * members (mimicking the global state we have today). | ||
579 | */ | ||
580 | static struct root_domain def_root_domain; | ||
581 | |||
582 | #endif /* CONFIG_SMP */ | ||
583 | |||
584 | /* | ||
585 | * This is the main, per-CPU runqueue data structure. | ||
586 | * | ||
587 | * Locking rule: those places that want to lock multiple runqueues | ||
588 | * (such as the load balancing or the thread migration code), lock | ||
589 | * acquire operations must be ordered by ascending &runqueue. | ||
590 | */ | ||
591 | struct rq { | ||
592 | /* runqueue lock: */ | ||
593 | raw_spinlock_t lock; | ||
594 | |||
595 | /* | ||
596 | * nr_running and cpu_load should be in the same cacheline because | ||
597 | * remote CPUs use both these fields when doing load calculation. | ||
598 | */ | ||
599 | unsigned long nr_running; | ||
600 | #define CPU_LOAD_IDX_MAX 5 | ||
601 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
602 | unsigned long last_load_update_tick; | ||
603 | #ifdef CONFIG_NO_HZ | ||
604 | u64 nohz_stamp; | ||
605 | unsigned char nohz_balance_kick; | ||
606 | #endif | ||
607 | int skip_clock_update; | ||
608 | |||
609 | /* capture load from *all* tasks on this cpu: */ | ||
610 | struct load_weight load; | ||
611 | unsigned long nr_load_updates; | ||
612 | u64 nr_switches; | ||
613 | |||
614 | struct cfs_rq cfs; | ||
615 | struct rt_rq rt; | ||
616 | |||
617 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
618 | /* list of leaf cfs_rq on this cpu: */ | ||
619 | struct list_head leaf_cfs_rq_list; | ||
620 | #endif | ||
621 | #ifdef CONFIG_RT_GROUP_SCHED | ||
622 | struct list_head leaf_rt_rq_list; | ||
623 | #endif | ||
624 | |||
625 | /* | ||
626 | * This is part of a global counter where only the total sum | ||
627 | * over all CPUs matters. A task can increase this counter on | ||
628 | * one CPU and if it got migrated afterwards it may decrease | ||
629 | * it on another CPU. Always updated under the runqueue lock: | ||
630 | */ | ||
631 | unsigned long nr_uninterruptible; | ||
632 | |||
633 | struct task_struct *curr, *idle, *stop; | ||
634 | unsigned long next_balance; | ||
635 | struct mm_struct *prev_mm; | ||
636 | |||
637 | u64 clock; | ||
638 | u64 clock_task; | ||
639 | |||
640 | atomic_t nr_iowait; | ||
641 | |||
642 | #ifdef CONFIG_SMP | ||
643 | struct root_domain *rd; | ||
644 | struct sched_domain *sd; | ||
645 | |||
646 | unsigned long cpu_power; | ||
647 | |||
648 | unsigned char idle_balance; | ||
649 | /* For active balancing */ | ||
650 | int post_schedule; | ||
651 | int active_balance; | ||
652 | int push_cpu; | ||
653 | struct cpu_stop_work active_balance_work; | ||
654 | /* cpu of this runqueue: */ | ||
655 | int cpu; | ||
656 | int online; | ||
657 | |||
658 | u64 rt_avg; | ||
659 | u64 age_stamp; | ||
660 | u64 idle_stamp; | ||
661 | u64 avg_idle; | ||
662 | #endif | ||
663 | |||
664 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
665 | u64 prev_irq_time; | ||
666 | #endif | ||
667 | #ifdef CONFIG_PARAVIRT | ||
668 | u64 prev_steal_time; | ||
669 | #endif | ||
670 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
671 | u64 prev_steal_time_rq; | ||
672 | #endif | ||
673 | |||
674 | /* calc_load related fields */ | ||
675 | unsigned long calc_load_update; | ||
676 | long calc_load_active; | ||
677 | |||
678 | #ifdef CONFIG_SCHED_HRTICK | ||
679 | #ifdef CONFIG_SMP | ||
680 | int hrtick_csd_pending; | ||
681 | struct call_single_data hrtick_csd; | ||
682 | #endif | ||
683 | struct hrtimer hrtick_timer; | ||
684 | #endif | ||
685 | |||
686 | #ifdef CONFIG_SCHEDSTATS | ||
687 | /* latency stats */ | ||
688 | struct sched_info rq_sched_info; | ||
689 | unsigned long long rq_cpu_time; | ||
690 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
691 | |||
692 | /* sys_sched_yield() stats */ | ||
693 | unsigned int yld_count; | ||
694 | |||
695 | /* schedule() stats */ | ||
696 | unsigned int sched_switch; | ||
697 | unsigned int sched_count; | ||
698 | unsigned int sched_goidle; | ||
699 | |||
700 | /* try_to_wake_up() stats */ | ||
701 | unsigned int ttwu_count; | ||
702 | unsigned int ttwu_local; | ||
703 | #endif | ||
704 | |||
705 | #ifdef CONFIG_SMP | ||
706 | struct llist_head wake_list; | ||
707 | #endif | ||
708 | }; | ||
709 | |||
710 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | ||
711 | |||
712 | |||
713 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
714 | |||
715 | static inline int cpu_of(struct rq *rq) | ||
716 | { | ||
717 | #ifdef CONFIG_SMP | ||
718 | return rq->cpu; | ||
719 | #else | ||
720 | return 0; | ||
721 | #endif | ||
722 | } | ||
723 | |||
724 | #define rcu_dereference_check_sched_domain(p) \ | ||
725 | rcu_dereference_check((p), \ | ||
726 | lockdep_is_held(&sched_domains_mutex)) | ||
727 | |||
728 | /* | ||
729 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
730 | * See detach_destroy_domains: synchronize_sched for details. | ||
731 | * | ||
732 | * The domain tree of any CPU may only be accessed from within | ||
733 | * preempt-disabled sections. | ||
734 | */ | ||
735 | #define for_each_domain(cpu, __sd) \ | ||
736 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | ||
737 | |||
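As a usage sketch, the for_each_domain() macro above (which moves to kernel/sched/sched.h along with this block) is walked under RCU protection or with preemption disabled, along these lines; the function name is illustrative:

	/* Illustrative only: inspect one CPU's sched-domain hierarchy. */
	static void dump_domains_sketch(int cpu)
	{
		struct sched_domain *sd;

		rcu_read_lock();
		for_each_domain(cpu, sd) {
			/* e.g. look at sd->flags or sched_domain_span(sd) */
		}
		rcu_read_unlock();
	}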
738 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
739 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
740 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
741 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
742 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
743 | |||
744 | #ifdef CONFIG_CGROUP_SCHED | ||
745 | |||
746 | /* | ||
747 | * Return the group to which this tasks belongs. | ||
748 | * | ||
749 | * We use task_subsys_state_check() and extend the RCU verification with | ||
750 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
751 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
752 | * we pin the task to the current cgroup. | ||
753 | */ | ||
754 | static inline struct task_group *task_group(struct task_struct *p) | ||
755 | { | ||
756 | struct task_group *tg; | ||
757 | struct cgroup_subsys_state *css; | ||
758 | |||
759 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
760 | lockdep_is_held(&p->pi_lock) || | ||
761 | lockdep_is_held(&task_rq(p)->lock)); | ||
762 | tg = container_of(css, struct task_group, css); | ||
763 | |||
764 | return autogroup_task_group(p, tg); | ||
765 | } | ||
766 | |||
767 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
768 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
769 | { | ||
770 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
771 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | ||
772 | p->se.parent = task_group(p)->se[cpu]; | ||
773 | #endif | ||
774 | |||
775 | #ifdef CONFIG_RT_GROUP_SCHED | ||
776 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
777 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
778 | #endif | ||
779 | } | ||
780 | |||
781 | #else /* CONFIG_CGROUP_SCHED */ | ||
782 | |||
783 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
784 | static inline struct task_group *task_group(struct task_struct *p) | ||
785 | { | ||
786 | return NULL; | ||
787 | } | ||
788 | |||
789 | #endif /* CONFIG_CGROUP_SCHED */ | ||
790 | 109 | ||
791 | static void update_rq_clock_task(struct rq *rq, s64 delta); | 110 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
792 | 111 | ||
793 | static void update_rq_clock(struct rq *rq) | 112 | void update_rq_clock(struct rq *rq) |
794 | { | 113 | { |
795 | s64 delta; | 114 | s64 delta; |
796 | 115 | ||
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq) | |||
803 | } | 122 | } |
804 | 123 | ||
805 | /* | 124 | /* |
806 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
807 | */ | ||
808 | #ifdef CONFIG_SCHED_DEBUG | ||
809 | # define const_debug __read_mostly | ||
810 | #else | ||
811 | # define const_debug static const | ||
812 | #endif | ||
813 | |||
814 | /** | ||
815 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked | ||
816 | * @cpu: the processor in question. | ||
817 | * | ||
818 | * This interface allows printk to be called with the runqueue lock | ||
819 | * held and know whether or not it is OK to wake up the klogd. | ||
820 | */ | ||
821 | int runqueue_is_locked(int cpu) | ||
822 | { | ||
823 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); | ||
824 | } | ||
825 | |||
826 | /* | ||
827 | * Debugging: various feature bits | 125 | * Debugging: various feature bits |
828 | */ | 126 | */ |
829 | 127 | ||
830 | #define SCHED_FEAT(name, enabled) \ | 128 | #define SCHED_FEAT(name, enabled) \ |
831 | __SCHED_FEAT_##name , | ||
832 | |||
833 | enum { | ||
834 | #include "sched_features.h" | ||
835 | }; | ||
836 | |||
837 | #undef SCHED_FEAT | ||
838 | |||
839 | #define SCHED_FEAT(name, enabled) \ | ||
840 | (1UL << __SCHED_FEAT_##name) * enabled | | 129 | (1UL << __SCHED_FEAT_##name) * enabled | |
841 | 130 | ||
842 | const_debug unsigned int sysctl_sched_features = | 131 | const_debug unsigned int sysctl_sched_features = |
843 | #include "sched_features.h" | 132 | #include "features.h" |
844 | 0; | 133 | 0; |
845 | 134 | ||
846 | #undef SCHED_FEAT | 135 | #undef SCHED_FEAT |
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features = | |||
850 | #name , | 139 | #name , |
851 | 140 | ||
852 | static __read_mostly char *sched_feat_names[] = { | 141 | static __read_mostly char *sched_feat_names[] = { |
853 | #include "sched_features.h" | 142 | #include "features.h" |
854 | NULL | 143 | NULL |
855 | }; | 144 | }; |
856 | 145 | ||
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
860 | { | 149 | { |
861 | int i; | 150 | int i; |
862 | 151 | ||
863 | for (i = 0; sched_feat_names[i]; i++) { | 152 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
864 | if (!(sysctl_sched_features & (1UL << i))) | 153 | if (!(sysctl_sched_features & (1UL << i))) |
865 | seq_puts(m, "NO_"); | 154 | seq_puts(m, "NO_"); |
866 | seq_printf(m, "%s ", sched_feat_names[i]); | 155 | seq_printf(m, "%s ", sched_feat_names[i]); |
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
870 | return 0; | 159 | return 0; |
871 | } | 160 | } |
872 | 161 | ||
162 | #ifdef HAVE_JUMP_LABEL | ||
163 | |||
164 | #define jump_label_key__true jump_label_key_enabled | ||
165 | #define jump_label_key__false jump_label_key_disabled | ||
166 | |||
167 | #define SCHED_FEAT(name, enabled) \ | ||
168 | jump_label_key__##enabled , | ||
169 | |||
170 | struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
171 | #include "features.h" | ||
172 | }; | ||
173 | |||
174 | #undef SCHED_FEAT | ||
175 | |||
176 | static void sched_feat_disable(int i) | ||
177 | { | ||
178 | if (jump_label_enabled(&sched_feat_keys[i])) | ||
179 | jump_label_dec(&sched_feat_keys[i]); | ||
180 | } | ||
181 | |||
182 | static void sched_feat_enable(int i) | ||
183 | { | ||
184 | if (!jump_label_enabled(&sched_feat_keys[i])) | ||
185 | jump_label_inc(&sched_feat_keys[i]); | ||
186 | } | ||
187 | #else | ||
188 | static void sched_feat_disable(int i) { }; | ||
189 | static void sched_feat_enable(int i) { }; | ||
190 | #endif /* HAVE_JUMP_LABEL */ | ||
191 | |||
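When HAVE_JUMP_LABEL is set, the sched_feat_keys[] array above lets feature tests compile down to a run-time-patched branch. A sketch of what a fast-path test could look like; the real sched_feat() wiring lives in kernel/sched/sched.h and may differ, sched_feat_fast() is an illustrative name, and static_branch() is assumed to be the jump-label test API of this kernel generation:

	/* Illustrative only: feature test keyed off the per-feature jump label. */
	static __always_inline bool sched_feat_fast(int i)
	{
		return static_branch(&sched_feat_keys[i]);
	}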
873 | static ssize_t | 192 | static ssize_t |
874 | sched_feat_write(struct file *filp, const char __user *ubuf, | 193 | sched_feat_write(struct file *filp, const char __user *ubuf, |
875 | size_t cnt, loff_t *ppos) | 194 | size_t cnt, loff_t *ppos) |
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
893 | cmp += 3; | 212 | cmp += 3; |
894 | } | 213 | } |
895 | 214 | ||
896 | for (i = 0; sched_feat_names[i]; i++) { | 215 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
897 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 216 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
898 | if (neg) | 217 | if (neg) { |
899 | sysctl_sched_features &= ~(1UL << i); | 218 | sysctl_sched_features &= ~(1UL << i); |
900 | else | 219 | sched_feat_disable(i); |
220 | } else { | ||
901 | sysctl_sched_features |= (1UL << i); | 221 | sysctl_sched_features |= (1UL << i); |
222 | sched_feat_enable(i); | ||
223 | } | ||
902 | break; | 224 | break; |
903 | } | 225 | } |
904 | } | 226 | } |
905 | 227 | ||
906 | if (!sched_feat_names[i]) | 228 | if (i == __SCHED_FEAT_NR) |
907 | return -EINVAL; | 229 | return -EINVAL; |
908 | 230 | ||
909 | *ppos += cnt; | 231 | *ppos += cnt; |
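In use, this write handler backs the sched_features debugfs file. Assuming debugfs is mounted at /sys/kernel/debug, a feature such as HRTICK can be inspected and toggled with, for example:

	cat /sys/kernel/debug/sched_features
	echo NO_HRTICK > /sys/kernel/debug/sched_features

A name written with the NO_ prefix clears the feature bit and, with jump labels enabled, decrements the corresponding key via sched_feat_disable(); the bare name sets both.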
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void) | |||
932 | return 0; | 254 | return 0; |
933 | } | 255 | } |
934 | late_initcall(sched_init_debug); | 256 | late_initcall(sched_init_debug); |
935 | 257 | #endif /* CONFIG_SCHED_DEBUG */ | |
936 | #endif | ||
937 | |||
938 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
939 | 258 | ||
940 | /* | 259 | /* |
941 | * Number of tasks to iterate in a single balance run. | 260 | * Number of tasks to iterate in a single balance run. |
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | |||
957 | */ | 276 | */ |
958 | unsigned int sysctl_sched_rt_period = 1000000; | 277 | unsigned int sysctl_sched_rt_period = 1000000; |
959 | 278 | ||
960 | static __read_mostly int scheduler_running; | 279 | __read_mostly int scheduler_running; |
961 | 280 | ||
962 | /* | 281 | /* |
963 | * part of the period that we allow rt tasks to run in us. | 282 | * part of the period that we allow rt tasks to run in us. |
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running; | |||
965 | */ | 284 | */ |
966 | int sysctl_sched_rt_runtime = 950000; | 285 | int sysctl_sched_rt_runtime = 950000; |
967 | 286 | ||
968 | static inline u64 global_rt_period(void) | ||
969 | { | ||
970 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
971 | } | ||
972 | 287 | ||
973 | static inline u64 global_rt_runtime(void) | ||
974 | { | ||
975 | if (sysctl_sched_rt_runtime < 0) | ||
976 | return RUNTIME_INF; | ||
977 | |||
978 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
979 | } | ||
980 | |||
981 | #ifndef prepare_arch_switch | ||
982 | # define prepare_arch_switch(next) do { } while (0) | ||
983 | #endif | ||
984 | #ifndef finish_arch_switch | ||
985 | # define finish_arch_switch(prev) do { } while (0) | ||
986 | #endif | ||
987 | |||
988 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
989 | { | ||
990 | return rq->curr == p; | ||
991 | } | ||
992 | |||
993 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
994 | { | ||
995 | #ifdef CONFIG_SMP | ||
996 | return p->on_cpu; | ||
997 | #else | ||
998 | return task_current(rq, p); | ||
999 | #endif | ||
1000 | } | ||
1001 | |||
1002 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
1003 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
1004 | { | ||
1005 | #ifdef CONFIG_SMP | ||
1006 | /* | ||
1007 | * We can optimise this out completely for !SMP, because the | ||
1008 | * SMP rebalancing from interrupt is the only thing that cares | ||
1009 | * here. | ||
1010 | */ | ||
1011 | next->on_cpu = 1; | ||
1012 | #endif | ||
1013 | } | ||
1014 | |||
1015 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
1016 | { | ||
1017 | #ifdef CONFIG_SMP | ||
1018 | /* | ||
1019 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
1020 | * We must ensure this doesn't happen until the switch is completely | ||
1021 | * finished. | ||
1022 | */ | ||
1023 | smp_wmb(); | ||
1024 | prev->on_cpu = 0; | ||
1025 | #endif | ||
1026 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
1027 | /* this is a valid case when another task releases the spinlock */ | ||
1028 | rq->lock.owner = current; | ||
1029 | #endif | ||
1030 | /* | ||
1031 | * If we are tracking spinlock dependencies then we have to | ||
1032 | * fix up the runqueue lock - which gets 'carried over' from | ||
1033 | * prev into current: | ||
1034 | */ | ||
1035 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
1036 | |||
1037 | raw_spin_unlock_irq(&rq->lock); | ||
1038 | } | ||
1039 | |||
1040 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
1041 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
1042 | { | ||
1043 | #ifdef CONFIG_SMP | ||
1044 | /* | ||
1045 | * We can optimise this out completely for !SMP, because the | ||
1046 | * SMP rebalancing from interrupt is the only thing that cares | ||
1047 | * here. | ||
1048 | */ | ||
1049 | next->on_cpu = 1; | ||
1050 | #endif | ||
1051 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1052 | raw_spin_unlock_irq(&rq->lock); | ||
1053 | #else | ||
1054 | raw_spin_unlock(&rq->lock); | ||
1055 | #endif | ||
1056 | } | ||
1057 | |||
1058 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
1059 | { | ||
1060 | #ifdef CONFIG_SMP | ||
1061 | /* | ||
1062 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
1063 | * We must ensure this doesn't happen until the switch is completely | ||
1064 | * finished. | ||
1065 | */ | ||
1066 | smp_wmb(); | ||
1067 | prev->on_cpu = 0; | ||
1068 | #endif | ||
1069 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1070 | local_irq_enable(); | ||
1071 | #endif | ||
1072 | } | ||
1073 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
1074 | 288 | ||
1075 | /* | 289 | /* |
1076 | * __task_rq_lock - lock the rq @p resides on. | 290 | * __task_rq_lock - lock the rq @p resides on. |
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void) | |||
1153 | * rq->lock. | 367 | * rq->lock. |
1154 | */ | 368 | */ |
1155 | 369 | ||
1156 | /* | ||
1157 | * Use hrtick when: | ||
1158 | * - enabled by features | ||
1159 | * - hrtimer is actually high res | ||
1160 | */ | ||
1161 | static inline int hrtick_enabled(struct rq *rq) | ||
1162 | { | ||
1163 | if (!sched_feat(HRTICK)) | ||
1164 | return 0; | ||
1165 | if (!cpu_active(cpu_of(rq))) | ||
1166 | return 0; | ||
1167 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
1168 | } | ||
1169 | |||
1170 | static void hrtick_clear(struct rq *rq) | 370 | static void hrtick_clear(struct rq *rq) |
1171 | { | 371 | { |
1172 | if (hrtimer_active(&rq->hrtick_timer)) | 372 | if (hrtimer_active(&rq->hrtick_timer)) |
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg) | |||
1210 | * | 410 | * |
1211 | * called with rq->lock held and irqs disabled | 411 | * called with rq->lock held and irqs disabled |
1212 | */ | 412 | */ |
1213 | static void hrtick_start(struct rq *rq, u64 delay) | 413 | void hrtick_start(struct rq *rq, u64 delay) |
1214 | { | 414 | { |
1215 | struct hrtimer *timer = &rq->hrtick_timer; | 415 | struct hrtimer *timer = &rq->hrtick_timer; |
1216 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 416 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void) | |||
1254 | * | 454 | * |
1255 | * called with rq->lock held and irqs disabled | 455 | * called with rq->lock held and irqs disabled |
1256 | */ | 456 | */ |
1257 | static void hrtick_start(struct rq *rq, u64 delay) | 457 | void hrtick_start(struct rq *rq, u64 delay) |
1258 | { | 458 | { |
1259 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 459 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
1260 | HRTIMER_MODE_REL_PINNED, 0); | 460 | HRTIMER_MODE_REL_PINNED, 0); |
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void) | |||
1305 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 505 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1306 | #endif | 506 | #endif |
1307 | 507 | ||
1308 | static void resched_task(struct task_struct *p) | 508 | void resched_task(struct task_struct *p) |
1309 | { | 509 | { |
1310 | int cpu; | 510 | int cpu; |
1311 | 511 | ||
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p) | |||
1326 | smp_send_reschedule(cpu); | 526 | smp_send_reschedule(cpu); |
1327 | } | 527 | } |
1328 | 528 | ||
1329 | static void resched_cpu(int cpu) | 529 | void resched_cpu(int cpu) |
1330 | { | 530 | { |
1331 | struct rq *rq = cpu_rq(cpu); | 531 | struct rq *rq = cpu_rq(cpu); |
1332 | unsigned long flags; | 532 | unsigned long flags; |
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu) | |||
1407 | 607 | ||
1408 | static inline bool got_nohz_idle_kick(void) | 608 | static inline bool got_nohz_idle_kick(void) |
1409 | { | 609 | { |
1410 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | 610 | int cpu = smp_processor_id(); |
611 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | ||
1411 | } | 612 | } |
1412 | 613 | ||
1413 | #else /* CONFIG_NO_HZ */ | 614 | #else /* CONFIG_NO_HZ */ |
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void) | |||
1419 | 620 | ||
1420 | #endif /* CONFIG_NO_HZ */ | 621 | #endif /* CONFIG_NO_HZ */ |
1421 | 622 | ||
1422 | static u64 sched_avg_period(void) | 623 | void sched_avg_update(struct rq *rq) |
1423 | { | ||
1424 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1425 | } | ||
1426 | |||
1427 | static void sched_avg_update(struct rq *rq) | ||
1428 | { | 624 | { |
1429 | s64 period = sched_avg_period(); | 625 | s64 period = sched_avg_period(); |
1430 | 626 | ||
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq) | |||
1440 | } | 636 | } |
1441 | } | 637 | } |
1442 | 638 | ||
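For reference, the period consumed by sched_avg_update() comes from sched_avg_period(), which this change moves out of core.c. The snippet below only checks the arithmetic; the 1000 ms value of sysctl_sched_time_avg is the usual default and should be treated as an assumption of the example.

/* Quick arithmetic check: half of the averaging window, in nanoseconds. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t sysctl_sched_time_avg = 1000;	/* ms, assumed default */
	uint64_t period = sysctl_sched_time_avg * NSEC_PER_MSEC / 2;

	printf("%" PRIu64 " ns\n", period);	/* 500000000 ns = 500 ms */
	return 0;
}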
1443 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1444 | { | ||
1445 | rq->rt_avg += rt_delta; | ||
1446 | sched_avg_update(rq); | ||
1447 | } | ||
1448 | |||
1449 | #else /* !CONFIG_SMP */ | 639 | #else /* !CONFIG_SMP */ |
1450 | static void resched_task(struct task_struct *p) | 640 | void resched_task(struct task_struct *p) |
1451 | { | 641 | { |
1452 | assert_raw_spin_locked(&task_rq(p)->lock); | 642 | assert_raw_spin_locked(&task_rq(p)->lock); |
1453 | set_tsk_need_resched(p); | 643 | set_tsk_need_resched(p); |
1454 | } | 644 | } |
1455 | |||
1456 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1457 | { | ||
1458 | } | ||
1459 | |||
1460 | static void sched_avg_update(struct rq *rq) | ||
1461 | { | ||
1462 | } | ||
1463 | #endif /* CONFIG_SMP */ | 645 | #endif /* CONFIG_SMP */ |
1464 | 646 | ||
1465 | #if BITS_PER_LONG == 32 | ||
1466 | # define WMULT_CONST (~0UL) | ||
1467 | #else | ||
1468 | # define WMULT_CONST (1UL << 32) | ||
1469 | #endif | ||
1470 | |||
1471 | #define WMULT_SHIFT 32 | ||
1472 | |||
1473 | /* | ||
1474 | * Shift right and round: | ||
1475 | */ | ||
1476 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
1477 | |||
1478 | /* | ||
1479 | * delta *= weight / lw | ||
1480 | */ | ||
1481 | static unsigned long | ||
1482 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
1483 | struct load_weight *lw) | ||
1484 | { | ||
1485 | u64 tmp; | ||
1486 | |||
1487 | /* | ||
1488 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1489 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1490 | * 2^SCHED_LOAD_RESOLUTION. | ||
1491 | */ | ||
1492 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1493 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1494 | else | ||
1495 | tmp = (u64)delta_exec; | ||
1496 | |||
1497 | if (!lw->inv_weight) { | ||
1498 | unsigned long w = scale_load_down(lw->weight); | ||
1499 | |||
1500 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1501 | lw->inv_weight = 1; | ||
1502 | else if (unlikely(!w)) | ||
1503 | lw->inv_weight = WMULT_CONST; | ||
1504 | else | ||
1505 | lw->inv_weight = WMULT_CONST / w; | ||
1506 | } | ||
1507 | |||
1508 | /* | ||
1509 | * Check whether we'd overflow the 64-bit multiplication: | ||
1510 | */ | ||
1511 | if (unlikely(tmp > WMULT_CONST)) | ||
1512 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
1513 | WMULT_SHIFT/2); | ||
1514 | else | ||
1515 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
1516 | |||
1517 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
1518 | } | ||
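calc_delta_mine() replaces the division delta * weight / lw->weight with a multiplication by a cached inverse (2^32 / weight) followed by a rounded shift. The following is a minimal userspace sketch of that trick; the input values and local names are made up for illustration and are not kernel symbols.

/* Inverse-weight trick: precompute inv = 2^32 / w once, then turn every
 * later division into a multiply plus a rounded shift. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32

/* Shift right and round, mirroring the SRR() macro above. */
static uint64_t srr(uint64_t x, unsigned int y)
{
	return (x + (1ULL << (y - 1))) >> y;
}

int main(void)
{
	uint64_t delta_exec = 3000000;		/* 3 ms of runtime, in ns */
	uint64_t weight = 1024;			/* weight of a nice-0 task */
	uint64_t lw_weight = 1024 + 335;	/* total queue weight: nice 0 + nice 5 */

	/* Cache the inverse once, as calc_delta_mine() caches lw->inv_weight. */
	uint64_t inv_weight = WMULT_CONST / lw_weight;

	uint64_t tmp = delta_exec * weight;
	uint64_t fast = srr(tmp * inv_weight, WMULT_SHIFT);	/* multiply + shift */
	uint64_t slow = tmp / lw_weight;			/* plain division */

	/* The two results agree up to one unit of rounding. */
	printf("fast=%" PRIu64 " slow=%" PRIu64 "\n", fast, slow);
	return 0;
}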
1519 | |||
1520 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
1521 | { | ||
1522 | lw->weight += inc; | ||
1523 | lw->inv_weight = 0; | ||
1524 | } | ||
1525 | |||
1526 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
1527 | { | ||
1528 | lw->weight -= dec; | ||
1529 | lw->inv_weight = 0; | ||
1530 | } | ||
1531 | |||
1532 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1533 | { | ||
1534 | lw->weight = w; | ||
1535 | lw->inv_weight = 0; | ||
1536 | } | ||
1537 | |||
1538 | /* | ||
1539 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
1540 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
1541 | * each task makes to its run queue's load is weighted according to its | ||
1542 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
1543 | * scaled version of the new time slice allocation that they receive on time | ||
1544 | * slice expiry etc. | ||
1545 | */ | ||
1546 | |||
1547 | #define WEIGHT_IDLEPRIO 3 | ||
1548 | #define WMULT_IDLEPRIO 1431655765 | ||
1549 | |||
1550 | /* | ||
1551 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
1552 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
1553 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
1554 | * that remained on nice 0. | ||
1555 | * | ||
1556 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
1557 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
1558 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
1559 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
1560 | * the relative distance between them is ~25%.) | ||
1561 | */ | ||
1562 | static const int prio_to_weight[40] = { | ||
1563 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
1564 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
1565 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
1566 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
1567 | /* 0 */ 1024, 820, 655, 526, 423, | ||
1568 | /* 5 */ 335, 272, 215, 172, 137, | ||
1569 | /* 10 */ 110, 87, 70, 56, 45, | ||
1570 | /* 15 */ 36, 29, 23, 18, 15, | ||
1571 | }; | ||
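A worked check of the "~10% per nice level" rule described above: the sampled weights are copied from prio_to_weight[], the "ideal" column is 1024 * 1.25^(-nice), and small deviations are expected because the table entries are hand-tuned rather than computed.

/* Compare a few table entries against the 1.25-per-level rule. */
#include <stdio.h>
#include <math.h>

int main(void)
{
	static const int table_weight[] = { 3121, 1024, 335, 110 };	/* from prio_to_weight[] */
	static const int nice_level[]   = {   -5,    0,   5,  10 };

	for (int i = 0; i < 4; i++) {
		double ideal = 1024.0 * pow(1.25, -nice_level[i]);
		printf("nice %3d: table %5d  ideal %8.1f\n",
		       nice_level[i], table_weight[i], ideal);
	}
	return 0;
}

Link with -lm; the rows agree to within roughly a percent.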
1572 | |||
1573 | /* | ||
1574 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
1575 | * | ||
1576 | * In cases where the weight does not change often, we can use the | ||
1577 | * precalculated inverse to speed up arithmetic by turning divisions | ||
1578 | * into multiplications: | ||
1579 | */ | ||
1580 | static const u32 prio_to_wmult[40] = { | ||
1581 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
1582 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
1583 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
1584 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
1585 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
1586 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
1587 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
1588 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
1589 | }; | ||
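The relationship stated in the comment, prio_to_wmult[i] = 2^32 / prio_to_weight[i], can be spot-checked for the nice-0 pair:

/* 2^32 / 1024 reproduces the nice-0 entry of prio_to_wmult[]. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	uint64_t weight = 1024;			/* prio_to_weight[] at nice 0 */
	uint64_t wmult  = (1ULL << 32) / weight;

	printf("%" PRIu64 "\n", wmult);		/* prints 4194304, matching the table */
	return 0;
}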
1590 | |||
1591 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
1592 | enum cpuacct_stat_index { | ||
1593 | CPUACCT_STAT_USER, /* ... user mode */ | ||
1594 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
1595 | |||
1596 | CPUACCT_STAT_NSTATS, | ||
1597 | }; | ||
1598 | |||
1599 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1600 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1601 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
1602 | enum cpuacct_stat_index idx, cputime_t val); | ||
1603 | #else | ||
1604 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1605 | static inline void cpuacct_update_stats(struct task_struct *tsk, | ||
1606 | enum cpuacct_stat_index idx, cputime_t val) {} | ||
1607 | #endif | ||
1608 | |||
1609 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1610 | { | ||
1611 | update_load_add(&rq->load, load); | ||
1612 | } | ||
1613 | |||
1614 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1615 | { | ||
1616 | update_load_sub(&rq->load, load); | ||
1617 | } | ||
1618 | |||
1619 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 647 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1620 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | 648 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) |
1621 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
1622 | |||
1623 | /* | 649 | /* |
1624 | * Iterate task_group tree rooted at *from, calling @down when first entering a | 650 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1625 | * node and @up when leaving it for the final time. | 651 | * node and @up when leaving it for the final time. |
1626 | * | 652 | * |
1627 | * Caller must hold rcu_lock or sufficient equivalent. | 653 | * Caller must hold rcu_lock or sufficient equivalent. |
1628 | */ | 654 | */ |
1629 | static int walk_tg_tree_from(struct task_group *from, | 655 | int walk_tg_tree_from(struct task_group *from, |
1630 | tg_visitor down, tg_visitor up, void *data) | 656 | tg_visitor down, tg_visitor up, void *data) |
1631 | { | 657 | { |
1632 | struct task_group *parent, *child; | 658 | struct task_group *parent, *child; |
@@ -1657,270 +683,13 @@ out: | |||
1657 | return ret; | 683 | return ret; |
1658 | } | 684 | } |
1659 | 685 | ||
1660 | /* | 686 | int tg_nop(struct task_group *tg, void *data) |
1661 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1662 | * leaving it for the final time. | ||
1663 | * | ||
1664 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1665 | */ | ||
1666 | |||
1667 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1668 | { | ||
1669 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1670 | } | ||
1671 | |||
1672 | static int tg_nop(struct task_group *tg, void *data) | ||
1673 | { | 687 | { |
1674 | return 0; | 688 | return 0; |
1675 | } | 689 | } |
1676 | #endif | 690 | #endif |
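walk_tg_tree_from() is a visitor-style walk: @down runs when a node is first entered, @up when it is left for the final time, and a non-zero return stops the walk. The sketch below is a userspace analogue using recursion for clarity (the kernel version is iterative); the struct, the test tree and all names are invented for the example.

/* Visit every node of a tree, calling "down" on entry and "up" on exit. */
#include <stdio.h>

struct node {
	const char *name;
	struct node *child;	/* first child */
	struct node *sibling;	/* next sibling */
};

typedef int (*visitor)(struct node *, void *);

static int walk_from(struct node *n, visitor down, visitor up, void *data)
{
	int ret = down(n, data);
	if (ret)
		return ret;		/* a non-zero visitor result aborts the walk */
	for (struct node *c = n->child; c; c = c->sibling) {
		ret = walk_from(c, down, up, data);
		if (ret)
			return ret;
	}
	return up(n, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
	struct node b = { "B", NULL, NULL };
	struct node c = { "C", NULL, NULL };
	struct node root = { "root", &b, NULL };

	b.sibling = &c;
	/* Prints: down root, down B, up B, down C, up C, up root. */
	walk_from(&root, print_down, print_up, NULL);
	return 0;
}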
1677 | 691 | ||
1678 | #ifdef CONFIG_SMP | 692 | void update_cpu_load(struct rq *this_rq); |
1679 | /* Used instead of source_load when we know the type == 0 */ | ||
1680 | static unsigned long weighted_cpuload(const int cpu) | ||
1681 | { | ||
1682 | return cpu_rq(cpu)->load.weight; | ||
1683 | } | ||
1684 | |||
1685 | /* | ||
1686 | * Return a low guess at the load of a migration-source cpu weighted | ||
1687 | * according to the scheduling class and "nice" value. | ||
1688 | * | ||
1689 | * We want to under-estimate the load of migration sources, to | ||
1690 | * balance conservatively. | ||
1691 | */ | ||
1692 | static unsigned long source_load(int cpu, int type) | ||
1693 | { | ||
1694 | struct rq *rq = cpu_rq(cpu); | ||
1695 | unsigned long total = weighted_cpuload(cpu); | ||
1696 | |||
1697 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1698 | return total; | ||
1699 | |||
1700 | return min(rq->cpu_load[type-1], total); | ||
1701 | } | ||
1702 | |||
1703 | /* | ||
1704 | * Return a high guess at the load of a migration-target cpu weighted | ||
1705 | * according to the scheduling class and "nice" value. | ||
1706 | */ | ||
1707 | static unsigned long target_load(int cpu, int type) | ||
1708 | { | ||
1709 | struct rq *rq = cpu_rq(cpu); | ||
1710 | unsigned long total = weighted_cpuload(cpu); | ||
1711 | |||
1712 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1713 | return total; | ||
1714 | |||
1715 | return max(rq->cpu_load[type-1], total); | ||
1716 | } | ||
1717 | |||
1718 | static unsigned long power_of(int cpu) | ||
1719 | { | ||
1720 | return cpu_rq(cpu)->cpu_power; | ||
1721 | } | ||
1722 | |||
1723 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1724 | |||
1725 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1726 | { | ||
1727 | struct rq *rq = cpu_rq(cpu); | ||
1728 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
1729 | |||
1730 | if (nr_running) | ||
1731 | return rq->load.weight / nr_running; | ||
1732 | |||
1733 | return 0; | ||
1734 | } | ||
1735 | |||
1736 | #ifdef CONFIG_PREEMPT | ||
1737 | |||
1738 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1739 | |||
1740 | /* | ||
1741 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
1742 | * way at the expense of forcing extra atomic operations in all | ||
1743 | * invocations. This assures that the double_lock is acquired using the | ||
1744 | * same underlying policy as the spinlock_t on this architecture, which | ||
1745 | * reduces latency compared to the unfair variant below. However, it | ||
1746 | * also adds more overhead and therefore may reduce throughput. | ||
1747 | */ | ||
1748 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1749 | __releases(this_rq->lock) | ||
1750 | __acquires(busiest->lock) | ||
1751 | __acquires(this_rq->lock) | ||
1752 | { | ||
1753 | raw_spin_unlock(&this_rq->lock); | ||
1754 | double_rq_lock(this_rq, busiest); | ||
1755 | |||
1756 | return 1; | ||
1757 | } | ||
1758 | |||
1759 | #else | ||
1760 | /* | ||
1761 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1762 | * latency by eliminating extra atomic operations when the locks are | ||
1763 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1764 | * grant the double lock to lower cpus over higher ids under contention, | ||
1765 | * regardless of entry order into the function. | ||
1766 | */ | ||
1767 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1768 | __releases(this_rq->lock) | ||
1769 | __acquires(busiest->lock) | ||
1770 | __acquires(this_rq->lock) | ||
1771 | { | ||
1772 | int ret = 0; | ||
1773 | |||
1774 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1775 | if (busiest < this_rq) { | ||
1776 | raw_spin_unlock(&this_rq->lock); | ||
1777 | raw_spin_lock(&busiest->lock); | ||
1778 | raw_spin_lock_nested(&this_rq->lock, | ||
1779 | SINGLE_DEPTH_NESTING); | ||
1780 | ret = 1; | ||
1781 | } else | ||
1782 | raw_spin_lock_nested(&busiest->lock, | ||
1783 | SINGLE_DEPTH_NESTING); | ||
1784 | } | ||
1785 | return ret; | ||
1786 | } | ||
1787 | |||
1788 | #endif /* CONFIG_PREEMPT */ | ||
1789 | |||
1790 | /* | ||
1791 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1792 | */ | ||
1793 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1794 | { | ||
1795 | if (unlikely(!irqs_disabled())) { | ||
1796 | /* printk() doesn't work well under rq->lock */ | ||
1797 | raw_spin_unlock(&this_rq->lock); | ||
1798 | BUG_ON(1); | ||
1799 | } | ||
1800 | |||
1801 | return _double_lock_balance(this_rq, busiest); | ||
1802 | } | ||
1803 | |||
1804 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1805 | __releases(busiest->lock) | ||
1806 | { | ||
1807 | raw_spin_unlock(&busiest->lock); | ||
1808 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1809 | } | ||
1810 | |||
1811 | /* | ||
1812 | * double_rq_lock - safely lock two runqueues | ||
1813 | * | ||
1814 | * Note this does not disable interrupts like task_rq_lock, | ||
1815 | * you need to do so manually before calling. | ||
1816 | */ | ||
1817 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1818 | __acquires(rq1->lock) | ||
1819 | __acquires(rq2->lock) | ||
1820 | { | ||
1821 | BUG_ON(!irqs_disabled()); | ||
1822 | if (rq1 == rq2) { | ||
1823 | raw_spin_lock(&rq1->lock); | ||
1824 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1825 | } else { | ||
1826 | if (rq1 < rq2) { | ||
1827 | raw_spin_lock(&rq1->lock); | ||
1828 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1829 | } else { | ||
1830 | raw_spin_lock(&rq2->lock); | ||
1831 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1832 | } | ||
1833 | } | ||
1834 | } | ||
1835 | |||
1836 | /* | ||
1837 | * double_rq_unlock - safely unlock two runqueues | ||
1838 | * | ||
1839 | * Note this does not restore interrupts like task_rq_unlock, | ||
1840 | * you need to do so manually after calling. | ||
1841 | */ | ||
1842 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1843 | __releases(rq1->lock) | ||
1844 | __releases(rq2->lock) | ||
1845 | { | ||
1846 | raw_spin_unlock(&rq1->lock); | ||
1847 | if (rq1 != rq2) | ||
1848 | raw_spin_unlock(&rq2->lock); | ||
1849 | else | ||
1850 | __release(rq2->lock); | ||
1851 | } | ||
1852 | |||
1853 | #else /* CONFIG_SMP */ | ||
1854 | |||
1855 | /* | ||
1856 | * double_rq_lock - safely lock two runqueues | ||
1857 | * | ||
1858 | * Note this does not disable interrupts like task_rq_lock, | ||
1859 | * you need to do so manually before calling. | ||
1860 | */ | ||
1861 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1862 | __acquires(rq1->lock) | ||
1863 | __acquires(rq2->lock) | ||
1864 | { | ||
1865 | BUG_ON(!irqs_disabled()); | ||
1866 | BUG_ON(rq1 != rq2); | ||
1867 | raw_spin_lock(&rq1->lock); | ||
1868 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1869 | } | ||
1870 | |||
1871 | /* | ||
1872 | * double_rq_unlock - safely unlock two runqueues | ||
1873 | * | ||
1874 | * Note this does not restore interrupts like task_rq_unlock, | ||
1875 | * you need to do so manually after calling. | ||
1876 | */ | ||
1877 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1878 | __releases(rq1->lock) | ||
1879 | __releases(rq2->lock) | ||
1880 | { | ||
1881 | BUG_ON(rq1 != rq2); | ||
1882 | raw_spin_unlock(&rq1->lock); | ||
1883 | __release(rq2->lock); | ||
1884 | } | ||
1885 | |||
1886 | #endif | ||
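Both double_rq_lock() and the unfair _double_lock_balance() above rely on the same deadlock-avoidance rule: when two runqueue locks must be held at once, the lower-addressed lock is always taken first. A userspace sketch of that rule, with pthread mutexes standing in for rq->lock and all names invented:

/* Acquire two locks in a globally consistent (address) order. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);			/* same runqueue: take it once */
		return;
	}
	if ((uintptr_t)a < (uintptr_t)b) {		/* lower address first, like rq1 < rq2 */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&rq1, &rq2);		/* acquisition order is fixed regardless of argument order */
	puts("both runqueue locks held");
	unlock_pair(&rq1, &rq2);
	return 0;
}

Because every caller obeys the same order, two CPUs locking the same pair of runqueues can never each hold one lock while waiting for the other.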
1887 | |||
1888 | static void calc_load_account_idle(struct rq *this_rq); | ||
1889 | static void update_sysctl(void); | ||
1890 | static int get_update_sysctl_factor(void); | ||
1891 | static void update_cpu_load(struct rq *this_rq); | ||
1892 | |||
1893 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
1894 | { | ||
1895 | set_task_rq(p, cpu); | ||
1896 | #ifdef CONFIG_SMP | ||
1897 | /* | ||
1898 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
1899 | * successfully executed on another CPU. We must ensure that updates of | ||
1900 | * per-task data have been completed by this moment. | ||
1901 | */ | ||
1902 | smp_wmb(); | ||
1903 | task_thread_info(p)->cpu = cpu; | ||
1904 | #endif | ||
1905 | } | ||
1906 | |||
1907 | static const struct sched_class rt_sched_class; | ||
1908 | |||
1909 | #define sched_class_highest (&stop_sched_class) | ||
1910 | #define for_each_class(class) \ | ||
1911 | for (class = sched_class_highest; class; class = class->next) | ||
1912 | |||
1913 | #include "sched_stats.h" | ||
1914 | |||
1915 | static void inc_nr_running(struct rq *rq) | ||
1916 | { | ||
1917 | rq->nr_running++; | ||
1918 | } | ||
1919 | |||
1920 | static void dec_nr_running(struct rq *rq) | ||
1921 | { | ||
1922 | rq->nr_running--; | ||
1923 | } | ||
1924 | 693 | ||
1925 | static void set_load_weight(struct task_struct *p) | 694 | static void set_load_weight(struct task_struct *p) |
1926 | { | 695 | { |
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1957 | /* | 726 | /* |
1958 | * activate_task - move a task to the runqueue. | 727 | * activate_task - move a task to the runqueue. |
1959 | */ | 728 | */ |
1960 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) | 729 | void activate_task(struct rq *rq, struct task_struct *p, int flags) |
1961 | { | 730 | { |
1962 | if (task_contributes_to_load(p)) | 731 | if (task_contributes_to_load(p)) |
1963 | rq->nr_uninterruptible--; | 732 | rq->nr_uninterruptible--; |
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1968 | /* | 737 | /* |
1969 | * deactivate_task - remove a task from the runqueue. | 738 | * deactivate_task - remove a task from the runqueue. |
1970 | */ | 739 | */ |
1971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | 740 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
1972 | { | 741 | { |
1973 | if (task_contributes_to_load(p)) | 742 | if (task_contributes_to_load(p)) |
1974 | rq->nr_uninterruptible++; | 743 | rq->nr_uninterruptible++; |
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
2159 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
2160 | static int irqtime_account_hi_update(void) | 929 | static int irqtime_account_hi_update(void) |
2161 | { | 930 | { |
2162 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 931 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2163 | unsigned long flags; | 932 | unsigned long flags; |
2164 | u64 latest_ns; | 933 | u64 latest_ns; |
2165 | int ret = 0; | 934 | int ret = 0; |
2166 | 935 | ||
2167 | local_irq_save(flags); | 936 | local_irq_save(flags); |
2168 | latest_ns = this_cpu_read(cpu_hardirq_time); | 937 | latest_ns = this_cpu_read(cpu_hardirq_time); |
2169 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | 938 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) |
2170 | ret = 1; | 939 | ret = 1; |
2171 | local_irq_restore(flags); | 940 | local_irq_restore(flags); |
2172 | return ret; | 941 | return ret; |
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void) | |||
2174 | 943 | ||
2175 | static int irqtime_account_si_update(void) | 944 | static int irqtime_account_si_update(void) |
2176 | { | 945 | { |
2177 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 946 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2178 | unsigned long flags; | 947 | unsigned long flags; |
2179 | u64 latest_ns; | 948 | u64 latest_ns; |
2180 | int ret = 0; | 949 | int ret = 0; |
2181 | 950 | ||
2182 | local_irq_save(flags); | 951 | local_irq_save(flags); |
2183 | latest_ns = this_cpu_read(cpu_softirq_time); | 952 | latest_ns = this_cpu_read(cpu_softirq_time); |
2184 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | 953 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) |
2185 | ret = 1; | 954 | ret = 1; |
2186 | local_irq_restore(flags); | 955 | local_irq_restore(flags); |
2187 | return ret; | 956 | return ret; |
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void) | |||
2193 | 962 | ||
2194 | #endif | 963 | #endif |
2195 | 964 | ||
2196 | #include "sched_idletask.c" | ||
2197 | #include "sched_fair.c" | ||
2198 | #include "sched_rt.c" | ||
2199 | #include "sched_autogroup.c" | ||
2200 | #include "sched_stoptask.c" | ||
2201 | #ifdef CONFIG_SCHED_DEBUG | ||
2202 | # include "sched_debug.c" | ||
2203 | #endif | ||
2204 | |||
2205 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 965 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
2206 | { | 966 | { |
2207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 967 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2299 | p->sched_class->prio_changed(rq, p, oldprio); | 1059 | p->sched_class->prio_changed(rq, p, oldprio); |
2300 | } | 1060 | } |
2301 | 1061 | ||
2302 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 1062 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
2303 | { | 1063 | { |
2304 | const struct sched_class *class; | 1064 | const struct sched_class *class; |
2305 | 1065 | ||
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2325 | } | 1085 | } |
2326 | 1086 | ||
2327 | #ifdef CONFIG_SMP | 1087 | #ifdef CONFIG_SMP |
2328 | /* | ||
2329 | * Is this task likely cache-hot: | ||
2330 | */ | ||
2331 | static int | ||
2332 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
2333 | { | ||
2334 | s64 delta; | ||
2335 | |||
2336 | if (p->sched_class != &fair_sched_class) | ||
2337 | return 0; | ||
2338 | |||
2339 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2340 | return 0; | ||
2341 | |||
2342 | /* | ||
2343 | * Buddy candidates are cache hot: | ||
2344 | */ | ||
2345 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
2346 | (&p->se == cfs_rq_of(&p->se)->next || | ||
2347 | &p->se == cfs_rq_of(&p->se)->last)) | ||
2348 | return 1; | ||
2349 | |||
2350 | if (sysctl_sched_migration_cost == -1) | ||
2351 | return 1; | ||
2352 | if (sysctl_sched_migration_cost == 0) | ||
2353 | return 0; | ||
2354 | |||
2355 | delta = now - p->se.exec_start; | ||
2356 | |||
2357 | return delta < (s64)sysctl_sched_migration_cost; | ||
2358 | } | ||
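The cache-hot heuristic being moved out of this file boils down to one comparison plus two special sysctl values. A standalone sketch of just that decision, with plain integers standing in for the kernel types and the 0.5 ms default taken as an assumption:

/* A task is "hot" if it ran within migration_cost ns; -1 means always hot,
 * 0 means never hot. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int64_t migration_cost = 500000;		/* ns; assumed 0.5 ms default */

static bool task_hot(uint64_t now, uint64_t exec_start)
{
	if (migration_cost == -1)
		return true;
	if (migration_cost == 0)
		return false;
	return (int64_t)(now - exec_start) < migration_cost;
}

int main(void)
{
	/* Ran 0.2 ms ago: hot (1).  Ran 1.2 ms ago: cold (0). */
	printf("%d %d\n", task_hot(1000000, 800000), task_hot(2000000, 800000));
	return 0;
}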
2359 | |||
2360 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1088 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
2361 | { | 1089 | { |
2362 | #ifdef CONFIG_SCHED_DEBUG | 1090 | #ifdef CONFIG_SCHED_DEBUG |
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | |||
2783 | 1511 | ||
2784 | } | 1512 | } |
2785 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1513 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1514 | |||
1515 | static inline int ttwu_share_cache(int this_cpu, int that_cpu) | ||
1516 | { | ||
1517 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | ||
1518 | } | ||
2786 | #endif /* CONFIG_SMP */ | 1519 | #endif /* CONFIG_SMP */ |
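The new ttwu_share_cache() test used by ttwu_queue() below reduces "do these two CPUs share a last-level cache?" to an integer compare, because every CPU is tagged with the id of the first CPU in its LLC domain (see update_top_cache_domain() later in this patch). A sketch with a made-up topology table:

/* Two CPUs share a cache domain iff they carry the same llc id. */
#include <stdbool.h>
#include <stdio.h>

static const int llc_id[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };	/* two invented 4-CPU cache domains */

static bool share_cache(int this_cpu, int that_cpu)
{
	return llc_id[this_cpu] == llc_id[that_cpu];
}

int main(void)
{
	printf("%d %d\n", share_cache(1, 3), share_cache(1, 5));	/* prints 1 0 */
	return 0;
}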
2787 | 1520 | ||
2788 | static void ttwu_queue(struct task_struct *p, int cpu) | 1521 | static void ttwu_queue(struct task_struct *p, int cpu) |
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
2790 | struct rq *rq = cpu_rq(cpu); | 1523 | struct rq *rq = cpu_rq(cpu); |
2791 | 1524 | ||
2792 | #if defined(CONFIG_SMP) | 1525 | #if defined(CONFIG_SMP) |
2793 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | 1526 | if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { |
2794 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1527 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
2795 | ttwu_queue_remote(p, cpu); | 1528 | ttwu_queue_remote(p, cpu); |
2796 | return; | 1529 | return; |
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
3204 | local_irq_enable(); | 1937 | local_irq_enable(); |
3205 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1938 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3206 | finish_lock_switch(rq, prev); | 1939 | finish_lock_switch(rq, prev); |
1940 | trace_sched_stat_sleeptime(current, rq->clock); | ||
3207 | 1941 | ||
3208 | fire_sched_in_preempt_notifiers(current); | 1942 | fire_sched_in_preempt_notifiers(current); |
3209 | if (mm) | 1943 | if (mm) |
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
3439 | */ | 2173 | */ |
3440 | static atomic_long_t calc_load_tasks_idle; | 2174 | static atomic_long_t calc_load_tasks_idle; |
3441 | 2175 | ||
3442 | static void calc_load_account_idle(struct rq *this_rq) | 2176 | void calc_load_account_idle(struct rq *this_rq) |
3443 | { | 2177 | { |
3444 | long delta; | 2178 | long delta; |
3445 | 2179 | ||
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks) | |||
3583 | */ | 2317 | */ |
3584 | } | 2318 | } |
3585 | #else | 2319 | #else |
3586 | static void calc_load_account_idle(struct rq *this_rq) | 2320 | void calc_load_account_idle(struct rq *this_rq) |
3587 | { | 2321 | { |
3588 | } | 2322 | } |
3589 | 2323 | ||
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
3726 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2460 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3727 | * every tick. We fix it up based on jiffies. | 2461 | * every tick. We fix it up based on jiffies. |
3728 | */ | 2462 | */ |
3729 | static void update_cpu_load(struct rq *this_rq) | 2463 | void update_cpu_load(struct rq *this_rq) |
3730 | { | 2464 | { |
3731 | unsigned long this_load = this_rq->load.weight; | 2465 | unsigned long this_load = this_rq->load.weight; |
3732 | unsigned long curr_jiffies = jiffies; | 2466 | unsigned long curr_jiffies = jiffies; |
@@ -3804,8 +2538,10 @@ unlock: | |||
3804 | #endif | 2538 | #endif |
3805 | 2539 | ||
3806 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 2540 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
2541 | DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); | ||
3807 | 2542 | ||
3808 | EXPORT_PER_CPU_SYMBOL(kstat); | 2543 | EXPORT_PER_CPU_SYMBOL(kstat); |
2544 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); | ||
3809 | 2545 | ||
3810 | /* | 2546 | /* |
3811 | * Return any ns on the sched_clock that have not yet been accounted in | 2547 | * Return any ns on the sched_clock that have not yet been accounted in |
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3858 | return ns; | 2594 | return ns; |
3859 | } | 2595 | } |
3860 | 2596 | ||
2597 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2598 | struct cgroup_subsys cpuacct_subsys; | ||
2599 | struct cpuacct root_cpuacct; | ||
2600 | #endif | ||
2601 | |||
2602 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
2603 | u64 tmp) | ||
2604 | { | ||
2605 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2606 | struct kernel_cpustat *kcpustat; | ||
2607 | struct cpuacct *ca; | ||
2608 | #endif | ||
2609 | /* | ||
2610 | * Since all updates are sure to touch the root cgroup, we | ||
2611 | * get ourselves ahead and touch it first. If the root cgroup | ||
2612 | * is the only cgroup, then nothing else should be necessary. | ||
2613 | * | ||
2614 | */ | ||
2615 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
2616 | |||
2617 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2618 | if (unlikely(!cpuacct_subsys.active)) | ||
2619 | return; | ||
2620 | |||
2621 | rcu_read_lock(); | ||
2622 | ca = task_ca(p); | ||
2623 | while (ca && (ca != &root_cpuacct)) { | ||
2624 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
2625 | kcpustat->cpustat[index] += tmp; | ||
2626 | ca = parent_ca(ca); | ||
2627 | } | ||
2628 | rcu_read_unlock(); | ||
2629 | #endif | ||
2630 | } | ||
2631 | |||
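task_group_account_field() always charges the root statistics first and then walks the task's cpuacct ancestry up to, but not including, the root, applying the same increment at each level. The sketch below mirrors that walk in userspace; the structures and names are invented for the example.

/* Charge the root, then every group between the task's group and the root. */
#include <stdint.h>
#include <stdio.h>

struct group {
	const char *name;
	uint64_t cpustat[2];		/* [0] user, [1] system */
	struct group *parent;
};

static struct group root = { "root", { 0, 0 }, NULL };

static void account_field(struct group *g, int index, uint64_t delta)
{
	root.cpustat[index] += delta;		/* root is always touched first */

	for (; g && g != &root; g = g->parent)
		g->cpustat[index] += delta;	/* then each ancestor below the root */
}

int main(void)
{
	struct group parent = { "parent", { 0, 0 }, &root };
	struct group leaf   = { "leaf",   { 0, 0 }, &parent };

	account_field(&leaf, 0, 1000);
	printf("%llu %llu %llu\n",		/* prints: 1000 1000 1000 */
	       (unsigned long long)leaf.cpustat[0],
	       (unsigned long long)parent.cpustat[0],
	       (unsigned long long)root.cpustat[0]);
	return 0;
}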
2632 | |||
3861 | /* | 2633 | /* |
3862 | * Account user cpu time to a process. | 2634 | * Account user cpu time to a process. |
3863 | * @p: the process that the cpu time gets accounted to | 2635 | * @p: the process that the cpu time gets accounted to |
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3867 | void account_user_time(struct task_struct *p, cputime_t cputime, | 2639 | void account_user_time(struct task_struct *p, cputime_t cputime, |
3868 | cputime_t cputime_scaled) | 2640 | cputime_t cputime_scaled) |
3869 | { | 2641 | { |
3870 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2642 | int index; |
3871 | cputime64_t tmp; | ||
3872 | 2643 | ||
3873 | /* Add user time to process. */ | 2644 | /* Add user time to process. */ |
3874 | p->utime = cputime_add(p->utime, cputime); | 2645 | p->utime += cputime; |
3875 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2646 | p->utimescaled += cputime_scaled; |
3876 | account_group_user_time(p, cputime); | 2647 | account_group_user_time(p, cputime); |
3877 | 2648 | ||
2649 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
2650 | |||
3878 | /* Add user time to cpustat. */ | 2651 | /* Add user time to cpustat. */ |
3879 | tmp = cputime_to_cputime64(cputime); | 2652 | task_group_account_field(p, index, (__force u64) cputime); |
3880 | if (TASK_NICE(p) > 0) | ||
3881 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | ||
3882 | else | ||
3883 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3884 | 2653 | ||
3885 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); | ||
3886 | /* Account for user time used */ | 2654 | /* Account for user time used */ |
3887 | acct_update_integrals(p); | 2655 | acct_update_integrals(p); |
3888 | } | 2656 | } |
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
3896 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 2664 | static void account_guest_time(struct task_struct *p, cputime_t cputime, |
3897 | cputime_t cputime_scaled) | 2665 | cputime_t cputime_scaled) |
3898 | { | 2666 | { |
3899 | cputime64_t tmp; | 2667 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3900 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3901 | |||
3902 | tmp = cputime_to_cputime64(cputime); | ||
3903 | 2668 | ||
3904 | /* Add guest time to process. */ | 2669 | /* Add guest time to process. */ |
3905 | p->utime = cputime_add(p->utime, cputime); | 2670 | p->utime += cputime; |
3906 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2671 | p->utimescaled += cputime_scaled; |
3907 | account_group_user_time(p, cputime); | 2672 | account_group_user_time(p, cputime); |
3908 | p->gtime = cputime_add(p->gtime, cputime); | 2673 | p->gtime += cputime; |
3909 | 2674 | ||
3910 | /* Add guest time to cpustat. */ | 2675 | /* Add guest time to cpustat. */ |
3911 | if (TASK_NICE(p) > 0) { | 2676 | if (TASK_NICE(p) > 0) { |
3912 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 2677 | cpustat[CPUTIME_NICE] += (__force u64) cputime; |
3913 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | 2678 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; |
3914 | } else { | 2679 | } else { |
3915 | cpustat->user = cputime64_add(cpustat->user, tmp); | 2680 | cpustat[CPUTIME_USER] += (__force u64) cputime; |
3916 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 2681 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; |
3917 | } | 2682 | } |
3918 | } | 2683 | } |
3919 | 2684 | ||
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3926 | */ | 2691 | */ |
3927 | static inline | 2692 | static inline |
3928 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 2693 | void __account_system_time(struct task_struct *p, cputime_t cputime, |
3929 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | 2694 | cputime_t cputime_scaled, int index) |
3930 | { | 2695 | { |
3931 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3932 | |||
3933 | /* Add system time to process. */ | 2696 | /* Add system time to process. */ |
3934 | p->stime = cputime_add(p->stime, cputime); | 2697 | p->stime += cputime; |
3935 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | 2698 | p->stimescaled += cputime_scaled; |
3936 | account_group_system_time(p, cputime); | 2699 | account_group_system_time(p, cputime); |
3937 | 2700 | ||
3938 | /* Add system time to cpustat. */ | 2701 | /* Add system time to cpustat. */ |
3939 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | 2702 | task_group_account_field(p, index, (__force u64) cputime); |
3940 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3941 | 2703 | ||
3942 | /* Account for system time used */ | 2704 | /* Account for system time used */ |
3943 | acct_update_integrals(p); | 2705 | acct_update_integrals(p); |
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
3953 | void account_system_time(struct task_struct *p, int hardirq_offset, | 2715 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3954 | cputime_t cputime, cputime_t cputime_scaled) | 2716 | cputime_t cputime, cputime_t cputime_scaled) |
3955 | { | 2717 | { |
3956 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2718 | int index; |
3957 | cputime64_t *target_cputime64; | ||
3958 | 2719 | ||
3959 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 2720 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3960 | account_guest_time(p, cputime, cputime_scaled); | 2721 | account_guest_time(p, cputime, cputime_scaled); |
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3962 | } | 2723 | } |
3963 | 2724 | ||
3964 | if (hardirq_count() - hardirq_offset) | 2725 | if (hardirq_count() - hardirq_offset) |
3965 | target_cputime64 = &cpustat->irq; | 2726 | index = CPUTIME_IRQ; |
3966 | else if (in_serving_softirq()) | 2727 | else if (in_serving_softirq()) |
3967 | target_cputime64 = &cpustat->softirq; | 2728 | index = CPUTIME_SOFTIRQ; |
3968 | else | 2729 | else |
3969 | target_cputime64 = &cpustat->system; | 2730 | index = CPUTIME_SYSTEM; |
3970 | 2731 | ||
3971 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); | 2732 | __account_system_time(p, cputime, cputime_scaled, index); |
3972 | } | 2733 | } |
3973 | 2734 | ||
3974 | /* | 2735 | /* |
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3977 | */ | 2738 | */ |
3978 | void account_steal_time(cputime_t cputime) | 2739 | void account_steal_time(cputime_t cputime) |
3979 | { | 2740 | { |
3980 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2741 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3981 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3982 | 2742 | ||
3983 | cpustat->steal = cputime64_add(cpustat->steal, cputime64); | 2743 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; |
3984 | } | 2744 | } |
3985 | 2745 | ||
3986 | /* | 2746 | /* |
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime) | |||
3989 | */ | 2749 | */ |
3990 | void account_idle_time(cputime_t cputime) | 2750 | void account_idle_time(cputime_t cputime) |
3991 | { | 2751 | { |
3992 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2752 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3993 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3994 | struct rq *rq = this_rq(); | 2753 | struct rq *rq = this_rq(); |
3995 | 2754 | ||
3996 | if (atomic_read(&rq->nr_iowait) > 0) | 2755 | if (atomic_read(&rq->nr_iowait) > 0) |
3997 | cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); | 2756 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; |
3998 | else | 2757 | else |
3999 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 2758 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
4000 | } | 2759 | } |
4001 | 2760 | ||
4002 | static __always_inline bool steal_account_process_tick(void) | 2761 | static __always_inline bool steal_account_process_tick(void) |
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4046 | struct rq *rq) | 2805 | struct rq *rq) |
4047 | { | 2806 | { |
4048 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 2807 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
4049 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 2808 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
4050 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
4051 | 2809 | ||
4052 | if (steal_account_process_tick()) | 2810 | if (steal_account_process_tick()) |
4053 | return; | 2811 | return; |
4054 | 2812 | ||
4055 | if (irqtime_account_hi_update()) { | 2813 | if (irqtime_account_hi_update()) { |
4056 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 2814 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; |
4057 | } else if (irqtime_account_si_update()) { | 2815 | } else if (irqtime_account_si_update()) { |
4058 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 2816 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; |
4059 | } else if (this_cpu_ksoftirqd() == p) { | 2817 | } else if (this_cpu_ksoftirqd() == p) { |
4060 | /* | 2818 | /* |
4061 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 2819 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4063 | * Also, p->stime needs to be updated for ksoftirqd. | 2821 | * Also, p->stime needs to be updated for ksoftirqd. |
4064 | */ | 2822 | */ |
4065 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2823 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
4066 | &cpustat->softirq); | 2824 | CPUTIME_SOFTIRQ); |
4067 | } else if (user_tick) { | 2825 | } else if (user_tick) { |
4068 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2826 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
4069 | } else if (p == rq->idle) { | 2827 | } else if (p == rq->idle) { |
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4072 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2830 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); |
4073 | } else { | 2831 | } else { |
4074 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2832 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
4075 | &cpustat->system); | 2833 | CPUTIME_SYSTEM); |
4076 | } | 2834 | } |
4077 | } | 2835 | } |
4078 | 2836 | ||
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4171 | 2929 | ||
4172 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 2930 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
4173 | { | 2931 | { |
4174 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); | 2932 | cputime_t rtime, utime = p->utime, total = utime + p->stime; |
4175 | 2933 | ||
4176 | /* | 2934 | /* |
4177 | * Use CFS's precise accounting: | 2935 | * Use CFS's precise accounting: |
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4179 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 2937 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
4180 | 2938 | ||
4181 | if (total) { | 2939 | if (total) { |
4182 | u64 temp = rtime; | 2940 | u64 temp = (__force u64) rtime; |
4183 | 2941 | ||
4184 | temp *= utime; | 2942 | temp *= (__force u64) utime; |
4185 | do_div(temp, total); | 2943 | do_div(temp, (__force u32) total); |
4186 | utime = (cputime_t)temp; | 2944 | utime = (__force cputime_t) temp; |
4187 | } else | 2945 | } else |
4188 | utime = rtime; | 2946 | utime = rtime; |
4189 | 2947 | ||
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4191 | * Compare with previous values, to keep monotonicity: | 2949 | * Compare with previous values, to keep monotonicity: |
4192 | */ | 2950 | */ |
4193 | p->prev_utime = max(p->prev_utime, utime); | 2951 | p->prev_utime = max(p->prev_utime, utime); |
4194 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | 2952 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); |
4195 | 2953 | ||
4196 | *ut = p->prev_utime; | 2954 | *ut = p->prev_utime; |
4197 | *st = p->prev_stime; | 2955 | *st = p->prev_stime; |
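The arithmetic in task_times() splits the precise CFS runtime between user and system time in the same ratio as the tick-sampled utime/stime, then clamps both against the previously reported values so the numbers never go backwards. A worked example with plain integers standing in for cputime_t (all values invented):

/* rtime is split in the utime:stime ratio, then made monotonic. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t utime = 300, stime = 100;	/* tick samples: 3:1 user:system */
	uint64_t rtime = 480;			/* precise runtime from CFS */
	uint64_t total = utime + stime;

	uint64_t scaled_utime = total ? rtime * utime / total : rtime;	/* 480 * 300/400 = 360 */

	uint64_t prev_utime = 350, prev_stime = 100;	/* values reported last time */
	if (scaled_utime > prev_utime)
		prev_utime = scaled_utime;		/* 360 */
	if (rtime - prev_utime > prev_stime)
		prev_stime = rtime - prev_utime;	/* 480 - 360 = 120 */

	printf("ut=%llu st=%llu\n",			/* prints: ut=360 st=120 */
	       (unsigned long long)prev_utime,
	       (unsigned long long)prev_stime);
	return 0;
}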
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4208 | 2966 | ||
4209 | thread_group_cputime(p, &cputime); | 2967 | thread_group_cputime(p, &cputime); |
4210 | 2968 | ||
4211 | total = cputime_add(cputime.utime, cputime.stime); | 2969 | total = cputime.utime + cputime.stime; |
4212 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 2970 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
4213 | 2971 | ||
4214 | if (total) { | 2972 | if (total) { |
4215 | u64 temp = rtime; | 2973 | u64 temp = (__force u64) rtime; |
4216 | 2974 | ||
4217 | temp *= cputime.utime; | 2975 | temp *= (__force u64) cputime.utime; |
4218 | do_div(temp, total); | 2976 | do_div(temp, (__force u32) total); |
4219 | utime = (cputime_t)temp; | 2977 | utime = (__force cputime_t) temp; |
4220 | } else | 2978 | } else |
4221 | utime = rtime; | 2979 | utime = rtime; |
4222 | 2980 | ||
4223 | sig->prev_utime = max(sig->prev_utime, utime); | 2981 | sig->prev_utime = max(sig->prev_utime, utime); |
4224 | sig->prev_stime = max(sig->prev_stime, | 2982 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); |
4225 | cputime_sub(rtime, sig->prev_utime)); | ||
4226 | 2983 | ||
4227 | *ut = sig->prev_utime; | 2984 | *ut = sig->prev_utime; |
4228 | *st = sig->prev_stime; | 2985 | *st = sig->prev_stime; |
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4321 | { | 3078 | { |
4322 | struct pt_regs *regs = get_irq_regs(); | 3079 | struct pt_regs *regs = get_irq_regs(); |
4323 | 3080 | ||
3081 | if (oops_in_progress) | ||
3082 | return; | ||
3083 | |||
4324 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", | 3084 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
4325 | prev->comm, prev->pid, preempt_count()); | 3085 | prev->comm, prev->pid, preempt_count()); |
4326 | 3086 | ||
@@ -5852,6 +4612,13 @@ again: | |||
5852 | */ | 4612 | */ |
5853 | if (preempt && rq != p_rq) | 4613 | if (preempt && rq != p_rq) |
5854 | resched_task(p_rq->curr); | 4614 | resched_task(p_rq->curr); |
4615 | } else { | ||
4616 | /* | ||
4617 | * We might have set it in task_yield_fair(), but are | ||
4618 | * not going to schedule(), so don't want to skip | ||
4619 | * the next update. | ||
4620 | */ | ||
4621 | rq->skip_clock_update = 0; | ||
5855 | } | 4622 | } |
5856 | 4623 | ||
5857 | out: | 4624 | out: |
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p) | |||
6019 | free = stack_not_used(p); | 4786 | free = stack_not_used(p); |
6020 | #endif | 4787 | #endif |
6021 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4788 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
6022 | task_pid_nr(p), task_pid_nr(p->real_parent), | 4789 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), |
6023 | (unsigned long)task_thread_info(p)->flags); | 4790 | (unsigned long)task_thread_info(p)->flags); |
6024 | 4791 | ||
6025 | show_stack(p, NULL); | 4792 | show_stack(p, NULL); |
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
6118 | #endif | 4885 | #endif |
6119 | } | 4886 | } |
6120 | 4887 | ||
6121 | /* | ||
6122 | * Increase the granularity value when there are more CPUs, | ||
6123 | * because with more CPUs the 'effective latency' as visible | ||
6124 | * to users decreases. But the relationship is not linear, | ||
6125 | * so pick a second-best guess by going with the log2 of the | ||
6126 | * number of CPUs. | ||
6127 | * | ||
6128 | * This idea comes from the SD scheduler of Con Kolivas: | ||
6129 | */ | ||
6130 | static int get_update_sysctl_factor(void) | ||
6131 | { | ||
6132 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
6133 | unsigned int factor; | ||
6134 | |||
6135 | switch (sysctl_sched_tunable_scaling) { | ||
6136 | case SCHED_TUNABLESCALING_NONE: | ||
6137 | factor = 1; | ||
6138 | break; | ||
6139 | case SCHED_TUNABLESCALING_LINEAR: | ||
6140 | factor = cpus; | ||
6141 | break; | ||
6142 | case SCHED_TUNABLESCALING_LOG: | ||
6143 | default: | ||
6144 | factor = 1 + ilog2(cpus); | ||
6145 | break; | ||
6146 | } | ||
6147 | |||
6148 | return factor; | ||
6149 | } | ||
6150 | |||
6151 | static void update_sysctl(void) | ||
6152 | { | ||
6153 | unsigned int factor = get_update_sysctl_factor(); | ||
6154 | |||
6155 | #define SET_SYSCTL(name) \ | ||
6156 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
6157 | SET_SYSCTL(sched_min_granularity); | ||
6158 | SET_SYSCTL(sched_latency); | ||
6159 | SET_SYSCTL(sched_wakeup_granularity); | ||
6160 | #undef SET_SYSCTL | ||
6161 | } | ||
6162 | |||
6163 | static inline void sched_init_granularity(void) | ||
6164 | { | ||
6165 | update_sysctl(); | ||
6166 | } | ||
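The granularity scaling removed here (it moves with the fair-class code) multiplies the normalized tunables by a factor that, in the default logarithmic mode, grows as 1 + log2(min(ncpus, 8)). A worked example; the 6 ms normalized latency is assumed for illustration only.

/* Scale factor for the sched tunables under logarithmic scaling. */
#include <stdio.h>

static unsigned int factor_log(unsigned int ncpus)
{
	unsigned int cpus = ncpus < 8 ? ncpus : 8;	/* capped at 8, as above */
	unsigned int log2 = 0;

	while (cpus >>= 1)
		log2++;					/* integer log2 */
	return 1 + log2;
}

int main(void)
{
	unsigned int normalized_latency_ns = 6000000;	/* 6 ms, assumed default */

	for (unsigned int cpus = 1; cpus <= 16; cpus *= 2)
		printf("%2u cpus -> factor %u, latency %u ns\n",
		       cpus, factor_log(cpus),
		       factor_log(cpus) * normalized_latency_ns);
	return 0;
}

So a single CPU keeps the 6 ms latency, 8 CPUs get 24 ms, and anything beyond 8 CPUs is treated like 8.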
6167 | |||
6168 | #ifdef CONFIG_SMP | 4888 | #ifdef CONFIG_SMP |
6169 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4889 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
6170 | { | 4890 | { |
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq) | |||
6351 | rq->calc_load_active = 0; | 5071 | rq->calc_load_active = 0; |
6352 | } | 5072 | } |
6353 | 5073 | ||
6354 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6355 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6356 | { | ||
6357 | struct cfs_rq *cfs_rq; | ||
6358 | |||
6359 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6360 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6361 | |||
6362 | if (!cfs_rq->runtime_enabled) | ||
6363 | continue; | ||
6364 | |||
6365 | /* | ||
6366 | * clock_task is not advancing so we just need to make sure | ||
6367 | * there's some valid quota amount | ||
6368 | */ | ||
6369 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6370 | if (cfs_rq_throttled(cfs_rq)) | ||
6371 | unthrottle_cfs_rq(cfs_rq); | ||
6372 | } | ||
6373 | } | ||
6374 | #else | ||
6375 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6376 | #endif | ||
6377 | |||
6378 | /* | 5074 | /* |
6379 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 5075 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6380 | * try_to_wake_up()->select_task_rq(). | 5076 | * try_to_wake_up()->select_task_rq(). |
@@ -6980,6 +5676,12 @@ out: | |||
6980 | return -ENOMEM; | 5676 | return -ENOMEM; |
6981 | } | 5677 | } |
6982 | 5678 | ||
5679 | /* | ||
5680 | * By default the system creates a single root-domain with all cpus as | ||
5681 | * members (mimicking the global state we have today). | ||
5682 | */ | ||
5683 | struct root_domain def_root_domain; | ||
5684 | |||
6983 | static void init_defrootdomain(void) | 5685 | static void init_defrootdomain(void) |
6984 | { | 5686 | { |
6985 | init_rootdomain(&def_root_domain); | 5687 | init_rootdomain(&def_root_domain); |
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
7051 | } | 5753 | } |
7052 | 5754 | ||
7053 | /* | 5755 | /* |
5756 | * Keep a special pointer to the highest sched_domain that has | ||
5757 | * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this | ||
5758 | * CPU; this allows us to avoid some pointer chasing in select_idle_sibling(). | ||
5759 | * | ||
5760 | * Also keep a unique ID per domain (we use the first cpu number in | ||
5761 | * the cpumask of the domain), this allows us to quickly tell if | ||
5762 | * two cpus are in the same cache domain, see ttwu_share_cache(). | ||
5763 | */ | ||
5764 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
5765 | DEFINE_PER_CPU(int, sd_llc_id); | ||
5766 | |||
5767 | static void update_top_cache_domain(int cpu) | ||
5768 | { | ||
5769 | struct sched_domain *sd; | ||
5770 | int id = cpu; | ||
5771 | |||
5772 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
5773 | if (sd) | ||
5774 | id = cpumask_first(sched_domain_span(sd)); | ||
5775 | |||
5776 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
5777 | per_cpu(sd_llc_id, cpu) = id; | ||
5778 | } | ||
5779 | |||
5780 | /* | ||
7054 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 5781 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
7055 | * hold the hotplug lock. | 5782 | * hold the hotplug lock. |
7056 | */ | 5783 | */ |
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
7089 | tmp = rq->sd; | 5816 | tmp = rq->sd; |
7090 | rcu_assign_pointer(rq->sd, sd); | 5817 | rcu_assign_pointer(rq->sd, sd); |
7091 | destroy_sched_domains(tmp, cpu); | 5818 | destroy_sched_domains(tmp, cpu); |
5819 | |||
5820 | update_top_cache_domain(cpu); | ||
7092 | } | 5821 | } |
7093 | 5822 | ||
7094 | /* cpus with isolated domains */ | 5823 | /* cpus with isolated domains */ |
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
7248 | continue; | 5977 | continue; |
7249 | 5978 | ||
7250 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5979 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
7251 | GFP_KERNEL, cpu_to_node(i)); | 5980 | GFP_KERNEL, cpu_to_node(cpu)); |
7252 | 5981 | ||
7253 | if (!sg) | 5982 | if (!sg) |
7254 | goto fail; | 5983 | goto fail; |
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7386 | return; | 6115 | return; |
7387 | 6116 | ||
7388 | update_group_power(sd, cpu); | 6117 | update_group_power(sd, cpu); |
6118 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | ||
6119 | } | ||
6120 | |||
6121 | int __weak arch_sd_sibling_asym_packing(void) | ||
6122 | { | ||
6123 | return 0*SD_ASYM_PACKING; | ||
7389 | } | 6124 | } |
7390 | 6125 | ||
7391 | /* | 6126 | /* |
@@ -8021,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | |||
8021 | } | 6756 | } |
8022 | } | 6757 | } |
8023 | 6758 | ||
8024 | static int update_runtime(struct notifier_block *nfb, | ||
8025 | unsigned long action, void *hcpu) | ||
8026 | { | ||
8027 | int cpu = (int)(long)hcpu; | ||
8028 | |||
8029 | switch (action) { | ||
8030 | case CPU_DOWN_PREPARE: | ||
8031 | case CPU_DOWN_PREPARE_FROZEN: | ||
8032 | disable_runtime(cpu_rq(cpu)); | ||
8033 | return NOTIFY_OK; | ||
8034 | |||
8035 | case CPU_DOWN_FAILED: | ||
8036 | case CPU_DOWN_FAILED_FROZEN: | ||
8037 | case CPU_ONLINE: | ||
8038 | case CPU_ONLINE_FROZEN: | ||
8039 | enable_runtime(cpu_rq(cpu)); | ||
8040 | return NOTIFY_OK; | ||
8041 | |||
8042 | default: | ||
8043 | return NOTIFY_DONE; | ||
8044 | } | ||
8045 | } | ||
8046 | |||
8047 | void __init sched_init_smp(void) | 6759 | void __init sched_init_smp(void) |
8048 | { | 6760 | { |
8049 | cpumask_var_t non_isolated_cpus; | 6761 | cpumask_var_t non_isolated_cpus; |
@@ -8092,104 +6804,11 @@ int in_sched_functions(unsigned long addr) | |||
8092 | && addr < (unsigned long)__sched_text_end); | 6804 | && addr < (unsigned long)__sched_text_end); |
8093 | } | 6805 | } |
8094 | 6806 | ||
8095 | static void init_cfs_rq(struct cfs_rq *cfs_rq) | 6807 | #ifdef CONFIG_CGROUP_SCHED |
8096 | { | 6808 | struct task_group root_task_group; |
8097 | cfs_rq->tasks_timeline = RB_ROOT; | ||
8098 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
8099 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
8100 | #ifndef CONFIG_64BIT | ||
8101 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
8102 | #endif | ||
8103 | } | ||
8104 | |||
8105 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
8106 | { | ||
8107 | struct rt_prio_array *array; | ||
8108 | int i; | ||
8109 | |||
8110 | array = &rt_rq->active; | ||
8111 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
8112 | INIT_LIST_HEAD(array->queue + i); | ||
8113 | __clear_bit(i, array->bitmap); | ||
8114 | } | ||
8115 | /* delimiter for bitsearch: */ | ||
8116 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
8117 | |||
8118 | #if defined CONFIG_SMP | ||
8119 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
8120 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
8121 | rt_rq->rt_nr_migratory = 0; | ||
8122 | rt_rq->overloaded = 0; | ||
8123 | plist_head_init(&rt_rq->pushable_tasks); | ||
8124 | #endif | ||
8125 | |||
8126 | rt_rq->rt_time = 0; | ||
8127 | rt_rq->rt_throttled = 0; | ||
8128 | rt_rq->rt_runtime = 0; | ||
8129 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
8130 | } | ||
8131 | |||
8132 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8133 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
8134 | struct sched_entity *se, int cpu, | ||
8135 | struct sched_entity *parent) | ||
8136 | { | ||
8137 | struct rq *rq = cpu_rq(cpu); | ||
8138 | |||
8139 | cfs_rq->tg = tg; | ||
8140 | cfs_rq->rq = rq; | ||
8141 | #ifdef CONFIG_SMP | ||
8142 | /* allow initial update_cfs_load() to truncate */ | ||
8143 | cfs_rq->load_stamp = 1; | ||
8144 | #endif | ||
8145 | init_cfs_rq_runtime(cfs_rq); | ||
8146 | |||
8147 | tg->cfs_rq[cpu] = cfs_rq; | ||
8148 | tg->se[cpu] = se; | ||
8149 | |||
8150 | /* se could be NULL for root_task_group */ | ||
8151 | if (!se) | ||
8152 | return; | ||
8153 | |||
8154 | if (!parent) | ||
8155 | se->cfs_rq = &rq->cfs; | ||
8156 | else | ||
8157 | se->cfs_rq = parent->my_q; | ||
8158 | |||
8159 | se->my_q = cfs_rq; | ||
8160 | update_load_set(&se->load, 0); | ||
8161 | se->parent = parent; | ||
8162 | } | ||
8163 | #endif | 6809 | #endif |
8164 | 6810 | ||
8165 | #ifdef CONFIG_RT_GROUP_SCHED | 6811 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
8166 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
8167 | struct sched_rt_entity *rt_se, int cpu, | ||
8168 | struct sched_rt_entity *parent) | ||
8169 | { | ||
8170 | struct rq *rq = cpu_rq(cpu); | ||
8171 | |||
8172 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
8173 | rt_rq->rt_nr_boosted = 0; | ||
8174 | rt_rq->rq = rq; | ||
8175 | rt_rq->tg = tg; | ||
8176 | |||
8177 | tg->rt_rq[cpu] = rt_rq; | ||
8178 | tg->rt_se[cpu] = rt_se; | ||
8179 | |||
8180 | if (!rt_se) | ||
8181 | return; | ||
8182 | |||
8183 | if (!parent) | ||
8184 | rt_se->rt_rq = &rq->rt; | ||
8185 | else | ||
8186 | rt_se->rt_rq = parent->my_q; | ||
8187 | |||
8188 | rt_se->my_q = rt_rq; | ||
8189 | rt_se->parent = parent; | ||
8190 | INIT_LIST_HEAD(&rt_se->run_list); | ||
8191 | } | ||
8192 | #endif | ||
8193 | 6812 | ||
8194 | void __init sched_init(void) | 6813 | void __init sched_init(void) |
8195 | { | 6814 | { |
@@ -8247,9 +6866,17 @@ void __init sched_init(void) | |||
8247 | #ifdef CONFIG_CGROUP_SCHED | 6866 | #ifdef CONFIG_CGROUP_SCHED |
8248 | list_add(&root_task_group.list, &task_groups); | 6867 | list_add(&root_task_group.list, &task_groups); |
8249 | INIT_LIST_HEAD(&root_task_group.children); | 6868 | INIT_LIST_HEAD(&root_task_group.children); |
6869 | INIT_LIST_HEAD(&root_task_group.siblings); | ||
8250 | autogroup_init(&init_task); | 6870 | autogroup_init(&init_task); |
6871 | |||
8251 | #endif /* CONFIG_CGROUP_SCHED */ | 6872 | #endif /* CONFIG_CGROUP_SCHED */ |
8252 | 6873 | ||
6874 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6875 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6876 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6877 | /* Too early, not expected to fail */ | ||
6878 | BUG_ON(!root_cpuacct.cpuusage); | ||
6879 | #endif | ||
8253 | for_each_possible_cpu(i) { | 6880 | for_each_possible_cpu(i) { |
8254 | struct rq *rq; | 6881 | struct rq *rq; |
8255 | 6882 | ||
@@ -8261,7 +6888,7 @@ void __init sched_init(void) | |||
8261 | init_cfs_rq(&rq->cfs); | 6888 | init_cfs_rq(&rq->cfs); |
8262 | init_rt_rq(&rq->rt, rq); | 6889 | init_rt_rq(&rq->rt, rq); |
8263 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6890 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8264 | root_task_group.shares = root_task_group_load; | 6891 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
8265 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6892 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
8266 | /* | 6893 | /* |
8267 | * How much cpu bandwidth does root_task_group get? | 6894 | * How much cpu bandwidth does root_task_group get? |
@@ -8311,7 +6938,7 @@ void __init sched_init(void) | |||
8311 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6938 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
8312 | rq_attach_root(rq, &def_root_domain); | 6939 | rq_attach_root(rq, &def_root_domain); |
8313 | #ifdef CONFIG_NO_HZ | 6940 | #ifdef CONFIG_NO_HZ |
8314 | rq->nohz_balance_kick = 0; | 6941 | rq->nohz_flags = 0; |
8315 | #endif | 6942 | #endif |
8316 | #endif | 6943 | #endif |
8317 | init_rq_hrtick(rq); | 6944 | init_rq_hrtick(rq); |
@@ -8324,10 +6951,6 @@ void __init sched_init(void) | |||
8324 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6951 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
8325 | #endif | 6952 | #endif |
8326 | 6953 | ||
8327 | #ifdef CONFIG_SMP | ||
8328 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
8329 | #endif | ||
8330 | |||
8331 | #ifdef CONFIG_RT_MUTEXES | 6954 | #ifdef CONFIG_RT_MUTEXES |
8332 | plist_head_init(&init_task.pi_waiters); | 6955 | plist_head_init(&init_task.pi_waiters); |
8333 | #endif | 6956 | #endif |
@@ -8355,17 +6978,11 @@ void __init sched_init(void) | |||
8355 | 6978 | ||
8356 | #ifdef CONFIG_SMP | 6979 | #ifdef CONFIG_SMP |
8357 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 6980 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8358 | #ifdef CONFIG_NO_HZ | ||
8359 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
8360 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | ||
8361 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
8362 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
8363 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
8364 | #endif | ||
8365 | /* May be allocated at isolcpus cmdline parse time */ | 6981 | /* May be allocated at isolcpus cmdline parse time */ |
8366 | if (cpu_isolated_map == NULL) | 6982 | if (cpu_isolated_map == NULL) |
8367 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 6983 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8368 | #endif /* SMP */ | 6984 | #endif |
6985 | init_sched_fair_class(); | ||
8369 | 6986 | ||
8370 | scheduler_running = 1; | 6987 | scheduler_running = 1; |
8371 | } | 6988 | } |
@@ -8517,169 +7134,14 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
8517 | 7134 | ||
8518 | #endif | 7135 | #endif |
8519 | 7136 | ||
8520 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8521 | static void free_fair_sched_group(struct task_group *tg) | ||
8522 | { | ||
8523 | int i; | ||
8524 | |||
8525 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8526 | |||
8527 | for_each_possible_cpu(i) { | ||
8528 | if (tg->cfs_rq) | ||
8529 | kfree(tg->cfs_rq[i]); | ||
8530 | if (tg->se) | ||
8531 | kfree(tg->se[i]); | ||
8532 | } | ||
8533 | |||
8534 | kfree(tg->cfs_rq); | ||
8535 | kfree(tg->se); | ||
8536 | } | ||
8537 | |||
8538 | static | ||
8539 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8540 | { | ||
8541 | struct cfs_rq *cfs_rq; | ||
8542 | struct sched_entity *se; | ||
8543 | int i; | ||
8544 | |||
8545 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8546 | if (!tg->cfs_rq) | ||
8547 | goto err; | ||
8548 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
8549 | if (!tg->se) | ||
8550 | goto err; | ||
8551 | |||
8552 | tg->shares = NICE_0_LOAD; | ||
8553 | |||
8554 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8555 | |||
8556 | for_each_possible_cpu(i) { | ||
8557 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
8558 | GFP_KERNEL, cpu_to_node(i)); | ||
8559 | if (!cfs_rq) | ||
8560 | goto err; | ||
8561 | |||
8562 | se = kzalloc_node(sizeof(struct sched_entity), | ||
8563 | GFP_KERNEL, cpu_to_node(i)); | ||
8564 | if (!se) | ||
8565 | goto err_free_rq; | ||
8566 | |||
8567 | init_cfs_rq(cfs_rq); | ||
8568 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
8569 | } | ||
8570 | |||
8571 | return 1; | ||
8572 | |||
8573 | err_free_rq: | ||
8574 | kfree(cfs_rq); | ||
8575 | err: | ||
8576 | return 0; | ||
8577 | } | ||
8578 | |||
8579 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8580 | { | ||
8581 | struct rq *rq = cpu_rq(cpu); | ||
8582 | unsigned long flags; | ||
8583 | |||
8584 | /* | ||
8585 | * Only empty task groups can be destroyed; so we can speculatively | ||
8586 | * check on_list without danger of it being re-added. | ||
8587 | */ | ||
8588 | if (!tg->cfs_rq[cpu]->on_list) | ||
8589 | return; | ||
8590 | |||
8591 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8592 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8593 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8594 | } | ||
8595 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | ||
8596 | static inline void free_fair_sched_group(struct task_group *tg) | ||
8597 | { | ||
8598 | } | ||
8599 | |||
8600 | static inline | ||
8601 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8602 | { | ||
8603 | return 1; | ||
8604 | } | ||
8605 | |||
8606 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8607 | { | ||
8608 | } | ||
8609 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
8610 | |||
8611 | #ifdef CONFIG_RT_GROUP_SCHED | 7137 | #ifdef CONFIG_RT_GROUP_SCHED |
8612 | static void free_rt_sched_group(struct task_group *tg) | ||
8613 | { | ||
8614 | int i; | ||
8615 | |||
8616 | if (tg->rt_se) | ||
8617 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8618 | |||
8619 | for_each_possible_cpu(i) { | ||
8620 | if (tg->rt_rq) | ||
8621 | kfree(tg->rt_rq[i]); | ||
8622 | if (tg->rt_se) | ||
8623 | kfree(tg->rt_se[i]); | ||
8624 | } | ||
8625 | |||
8626 | kfree(tg->rt_rq); | ||
8627 | kfree(tg->rt_se); | ||
8628 | } | ||
8629 | |||
8630 | static | ||
8631 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8632 | { | ||
8633 | struct rt_rq *rt_rq; | ||
8634 | struct sched_rt_entity *rt_se; | ||
8635 | int i; | ||
8636 | |||
8637 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8638 | if (!tg->rt_rq) | ||
8639 | goto err; | ||
8640 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
8641 | if (!tg->rt_se) | ||
8642 | goto err; | ||
8643 | |||
8644 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
8645 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
8646 | |||
8647 | for_each_possible_cpu(i) { | ||
8648 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
8649 | GFP_KERNEL, cpu_to_node(i)); | ||
8650 | if (!rt_rq) | ||
8651 | goto err; | ||
8652 | |||
8653 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
8654 | GFP_KERNEL, cpu_to_node(i)); | ||
8655 | if (!rt_se) | ||
8656 | goto err_free_rq; | ||
8657 | |||
8658 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
8659 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8660 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
8661 | } | ||
8662 | |||
8663 | return 1; | ||
8664 | |||
8665 | err_free_rq: | ||
8666 | kfree(rt_rq); | ||
8667 | err: | ||
8668 | return 0; | ||
8669 | } | ||
8670 | #else /* !CONFIG_RT_GROUP_SCHED */ | 7138 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8671 | static inline void free_rt_sched_group(struct task_group *tg) | ||
8672 | { | ||
8673 | } | ||
8674 | |||
8675 | static inline | ||
8676 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8677 | { | ||
8678 | return 1; | ||
8679 | } | ||
8680 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7139 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8681 | 7140 | ||
8682 | #ifdef CONFIG_CGROUP_SCHED | 7141 | #ifdef CONFIG_CGROUP_SCHED |
7142 | /* task_group_lock serializes the addition/removal of task groups */ | ||
7143 | static DEFINE_SPINLOCK(task_group_lock); | ||
7144 | |||
8683 | static void free_sched_group(struct task_group *tg) | 7145 | static void free_sched_group(struct task_group *tg) |
8684 | { | 7146 | { |
8685 | free_fair_sched_group(tg); | 7147 | free_fair_sched_group(tg); |
@@ -8785,47 +7247,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8785 | #endif /* CONFIG_CGROUP_SCHED */ | 7247 | #endif /* CONFIG_CGROUP_SCHED */ |
8786 | 7248 | ||
8787 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7249 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8788 | static DEFINE_MUTEX(shares_mutex); | ||
8789 | |||
8790 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
8791 | { | ||
8792 | int i; | ||
8793 | unsigned long flags; | ||
8794 | |||
8795 | /* | ||
8796 | * We can't change the weight of the root cgroup. | ||
8797 | */ | ||
8798 | if (!tg->se[0]) | ||
8799 | return -EINVAL; | ||
8800 | |||
8801 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
8802 | |||
8803 | mutex_lock(&shares_mutex); | ||
8804 | if (tg->shares == shares) | ||
8805 | goto done; | ||
8806 | |||
8807 | tg->shares = shares; | ||
8808 | for_each_possible_cpu(i) { | ||
8809 | struct rq *rq = cpu_rq(i); | ||
8810 | struct sched_entity *se; | ||
8811 | |||
8812 | se = tg->se[i]; | ||
8813 | /* Propagate contribution to hierarchy */ | ||
8814 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8815 | for_each_sched_entity(se) | ||
8816 | update_cfs_shares(group_cfs_rq(se)); | ||
8817 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8818 | } | ||
8819 | |||
8820 | done: | ||
8821 | mutex_unlock(&shares_mutex); | ||
8822 | return 0; | ||
8823 | } | ||
8824 | |||
8825 | unsigned long sched_group_shares(struct task_group *tg) | ||
8826 | { | ||
8827 | return tg->shares; | ||
8828 | } | ||
8829 | #endif | 7250 | #endif |
8830 | 7251 | ||
8831 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | 7252 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
@@ -8850,7 +7271,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
8850 | struct task_struct *g, *p; | 7271 | struct task_struct *g, *p; |
8851 | 7272 | ||
8852 | do_each_thread(g, p) { | 7273 | do_each_thread(g, p) { |
8853 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 7274 | if (rt_task(p) && task_rq(p)->rt.tg == tg) |
8854 | return 1; | 7275 | return 1; |
8855 | } while_each_thread(g, p); | 7276 | } while_each_thread(g, p); |
8856 | 7277 | ||
@@ -9201,8 +7622,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | |||
9201 | 7622 | ||
9202 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | 7623 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) |
9203 | { | 7624 | { |
9204 | int i, ret = 0, runtime_enabled; | 7625 | int i, ret = 0, runtime_enabled, runtime_was_enabled; |
9205 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7626 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9206 | 7627 | ||
9207 | if (tg == &root_task_group) | 7628 | if (tg == &root_task_group) |
9208 | return -EINVAL; | 7629 | return -EINVAL; |
@@ -9229,6 +7650,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
9229 | goto out_unlock; | 7650 | goto out_unlock; |
9230 | 7651 | ||
9231 | runtime_enabled = quota != RUNTIME_INF; | 7652 | runtime_enabled = quota != RUNTIME_INF; |
7653 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | ||
7654 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | ||
9232 | raw_spin_lock_irq(&cfs_b->lock); | 7655 | raw_spin_lock_irq(&cfs_b->lock); |
9233 | cfs_b->period = ns_to_ktime(period); | 7656 | cfs_b->period = ns_to_ktime(period); |
9234 | cfs_b->quota = quota; | 7657 | cfs_b->quota = quota; |
@@ -9244,13 +7667,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
9244 | 7667 | ||
9245 | for_each_possible_cpu(i) { | 7668 | for_each_possible_cpu(i) { |
9246 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7669 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
9247 | struct rq *rq = rq_of(cfs_rq); | 7670 | struct rq *rq = cfs_rq->rq; |
9248 | 7671 | ||
9249 | raw_spin_lock_irq(&rq->lock); | 7672 | raw_spin_lock_irq(&rq->lock); |
9250 | cfs_rq->runtime_enabled = runtime_enabled; | 7673 | cfs_rq->runtime_enabled = runtime_enabled; |
9251 | cfs_rq->runtime_remaining = 0; | 7674 | cfs_rq->runtime_remaining = 0; |
9252 | 7675 | ||
9253 | if (cfs_rq_throttled(cfs_rq)) | 7676 | if (cfs_rq->throttled) |
9254 | unthrottle_cfs_rq(cfs_rq); | 7677 | unthrottle_cfs_rq(cfs_rq); |
9255 | raw_spin_unlock_irq(&rq->lock); | 7678 | raw_spin_unlock_irq(&rq->lock); |
9256 | } | 7679 | } |
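For orientation (values assumed, not part of the patch): tg_set_cfs_bandwidth() is reached from the cpu controller's cgroup files, where a 50ms quota against a 100ms period caps the group at half a cpu per period and a quota of -1 maps to RUNTIME_INF, i.e. no throttling:

	/*
	 * e.g. via the cpu controller interface files:
	 *   cpu.cfs_period_us = 100000, cpu.cfs_quota_us = 50000
	 *     -> tg_set_cfs_bandwidth(tg, 100000 * NSEC_PER_USEC,
	 *                                  50000 * NSEC_PER_USEC)
	 *   cpu.cfs_quota_us = -1
	 *     -> quota = RUNTIME_INF, runtime_enabled = 0 on every cpu
	 */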
@@ -9264,7 +7687,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | |||
9264 | { | 7687 | { |
9265 | u64 quota, period; | 7688 | u64 quota, period; |
9266 | 7689 | ||
9267 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7690 | period = ktime_to_ns(tg->cfs_bandwidth.period); |
9268 | if (cfs_quota_us < 0) | 7691 | if (cfs_quota_us < 0) |
9269 | quota = RUNTIME_INF; | 7692 | quota = RUNTIME_INF; |
9270 | else | 7693 | else |
@@ -9277,10 +7700,10 @@ long tg_get_cfs_quota(struct task_group *tg) | |||
9277 | { | 7700 | { |
9278 | u64 quota_us; | 7701 | u64 quota_us; |
9279 | 7702 | ||
9280 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | 7703 | if (tg->cfs_bandwidth.quota == RUNTIME_INF) |
9281 | return -1; | 7704 | return -1; |
9282 | 7705 | ||
9283 | quota_us = tg_cfs_bandwidth(tg)->quota; | 7706 | quota_us = tg->cfs_bandwidth.quota; |
9284 | do_div(quota_us, NSEC_PER_USEC); | 7707 | do_div(quota_us, NSEC_PER_USEC); |
9285 | 7708 | ||
9286 | return quota_us; | 7709 | return quota_us; |
@@ -9291,10 +7714,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | |||
9291 | u64 quota, period; | 7714 | u64 quota, period; |
9292 | 7715 | ||
9293 | period = (u64)cfs_period_us * NSEC_PER_USEC; | 7716 | period = (u64)cfs_period_us * NSEC_PER_USEC; |
9294 | quota = tg_cfs_bandwidth(tg)->quota; | 7717 | quota = tg->cfs_bandwidth.quota; |
9295 | |||
9296 | if (period <= 0) | ||
9297 | return -EINVAL; | ||
9298 | 7718 | ||
9299 | return tg_set_cfs_bandwidth(tg, period, quota); | 7719 | return tg_set_cfs_bandwidth(tg, period, quota); |
9300 | } | 7720 | } |
@@ -9303,7 +7723,7 @@ long tg_get_cfs_period(struct task_group *tg) | |||
9303 | { | 7723 | { |
9304 | u64 cfs_period_us; | 7724 | u64 cfs_period_us; |
9305 | 7725 | ||
9306 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7726 | cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); |
9307 | do_div(cfs_period_us, NSEC_PER_USEC); | 7727 | do_div(cfs_period_us, NSEC_PER_USEC); |
9308 | 7728 | ||
9309 | return cfs_period_us; | 7729 | return cfs_period_us; |
@@ -9363,13 +7783,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, | |||
9363 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | 7783 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) |
9364 | { | 7784 | { |
9365 | struct cfs_schedulable_data *d = data; | 7785 | struct cfs_schedulable_data *d = data; |
9366 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7786 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9367 | s64 quota = 0, parent_quota = -1; | 7787 | s64 quota = 0, parent_quota = -1; |
9368 | 7788 | ||
9369 | if (!tg->parent) { | 7789 | if (!tg->parent) { |
9370 | quota = RUNTIME_INF; | 7790 | quota = RUNTIME_INF; |
9371 | } else { | 7791 | } else { |
9372 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | 7792 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
9373 | 7793 | ||
9374 | quota = normalize_cfs_quota(tg, d); | 7794 | quota = normalize_cfs_quota(tg, d); |
9375 | parent_quota = parent_b->hierarchal_quota; | 7795 | parent_quota = parent_b->hierarchal_quota; |
@@ -9413,7 +7833,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
9413 | struct cgroup_map_cb *cb) | 7833 | struct cgroup_map_cb *cb) |
9414 | { | 7834 | { |
9415 | struct task_group *tg = cgroup_tg(cgrp); | 7835 | struct task_group *tg = cgroup_tg(cgrp); |
9416 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7836 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9417 | 7837 | ||
9418 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7838 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
9419 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | 7839 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); |
@@ -9514,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9514 | * (balbir@in.ibm.com). | 7934 | * (balbir@in.ibm.com). |
9515 | */ | 7935 | */ |
9516 | 7936 | ||
9517 | /* track cpu usage of a group of tasks and its child groups */ | ||
9518 | struct cpuacct { | ||
9519 | struct cgroup_subsys_state css; | ||
9520 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
9521 | u64 __percpu *cpuusage; | ||
9522 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | ||
9523 | struct cpuacct *parent; | ||
9524 | }; | ||
9525 | |||
9526 | struct cgroup_subsys cpuacct_subsys; | ||
9527 | |||
9528 | /* return cpu accounting group corresponding to this container */ | ||
9529 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
9530 | { | ||
9531 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
9532 | struct cpuacct, css); | ||
9533 | } | ||
9534 | |||
9535 | /* return cpu accounting group to which this task belongs */ | ||
9536 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
9537 | { | ||
9538 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
9539 | struct cpuacct, css); | ||
9540 | } | ||
9541 | |||
9542 | /* create a new cpu accounting group */ | 7937 | /* create a new cpu accounting group */ |
9543 | static struct cgroup_subsys_state *cpuacct_create( | 7938 | static struct cgroup_subsys_state *cpuacct_create( |
9544 | struct cgroup_subsys *ss, struct cgroup *cgrp) | 7939 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
9545 | { | 7940 | { |
9546 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 7941 | struct cpuacct *ca; |
9547 | int i; | ||
9548 | 7942 | ||
7943 | if (!cgrp->parent) | ||
7944 | return &root_cpuacct.css; | ||
7945 | |||
7946 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
9549 | if (!ca) | 7947 | if (!ca) |
9550 | goto out; | 7948 | goto out; |
9551 | 7949 | ||
@@ -9553,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
9553 | if (!ca->cpuusage) | 7951 | if (!ca->cpuusage) |
9554 | goto out_free_ca; | 7952 | goto out_free_ca; |
9555 | 7953 | ||
9556 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7954 | ca->cpustat = alloc_percpu(struct kernel_cpustat); |
9557 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 7955 | if (!ca->cpustat) |
9558 | goto out_free_counters; | 7956 | goto out_free_cpuusage; |
9559 | |||
9560 | if (cgrp->parent) | ||
9561 | ca->parent = cgroup_ca(cgrp->parent); | ||
9562 | 7957 | ||
9563 | return &ca->css; | 7958 | return &ca->css; |
9564 | 7959 | ||
9565 | out_free_counters: | 7960 | out_free_cpuusage: |
9566 | while (--i >= 0) | ||
9567 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9568 | free_percpu(ca->cpuusage); | 7961 | free_percpu(ca->cpuusage); |
9569 | out_free_ca: | 7962 | out_free_ca: |
9570 | kfree(ca); | 7963 | kfree(ca); |
@@ -9577,10 +7970,8 @@ static void | |||
9577 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7970 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
9578 | { | 7971 | { |
9579 | struct cpuacct *ca = cgroup_ca(cgrp); | 7972 | struct cpuacct *ca = cgroup_ca(cgrp); |
9580 | int i; | ||
9581 | 7973 | ||
9582 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7974 | free_percpu(ca->cpustat); |
9583 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9584 | free_percpu(ca->cpuusage); | 7975 | free_percpu(ca->cpuusage); |
9585 | kfree(ca); | 7976 | kfree(ca); |
9586 | } | 7977 | } |
@@ -9673,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = { | |||
9673 | }; | 8064 | }; |
9674 | 8065 | ||
9675 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 8066 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, |
9676 | struct cgroup_map_cb *cb) | 8067 | struct cgroup_map_cb *cb) |
9677 | { | 8068 | { |
9678 | struct cpuacct *ca = cgroup_ca(cgrp); | 8069 | struct cpuacct *ca = cgroup_ca(cgrp); |
9679 | int i; | 8070 | int cpu; |
8071 | s64 val = 0; | ||
8072 | |||
8073 | for_each_online_cpu(cpu) { | ||
8074 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8075 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8076 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8077 | } | ||
8078 | val = cputime64_to_clock_t(val); | ||
8079 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
9680 | 8080 | ||
9681 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { | 8081 | val = 0; |
9682 | s64 val = percpu_counter_read(&ca->cpustat[i]); | 8082 | for_each_online_cpu(cpu) { |
9683 | val = cputime64_to_clock_t(val); | 8083 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); |
9684 | cb->fill(cb, cpuacct_stat_desc[i], val); | 8084 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; |
8085 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8086 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
9685 | } | 8087 | } |
8088 | |||
8089 | val = cputime64_to_clock_t(val); | ||
8090 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8091 | |||
9686 | return 0; | 8092 | return 0; |
9687 | } | 8093 | } |
9688 | 8094 | ||
@@ -9712,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9712 | * | 8118 | * |
9713 | * called with rq->lock held. | 8119 | * called with rq->lock held. |
9714 | */ | 8120 | */ |
9715 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 8121 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
9716 | { | 8122 | { |
9717 | struct cpuacct *ca; | 8123 | struct cpuacct *ca; |
9718 | int cpu; | 8124 | int cpu; |
@@ -9726,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9726 | 8132 | ||
9727 | ca = task_ca(tsk); | 8133 | ca = task_ca(tsk); |
9728 | 8134 | ||
9729 | for (; ca; ca = ca->parent) { | 8135 | for (; ca; ca = parent_ca(ca)) { |
9730 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 8136 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9731 | *cpuusage += cputime; | 8137 | *cpuusage += cputime; |
9732 | } | 8138 | } |
@@ -9734,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9734 | rcu_read_unlock(); | 8140 | rcu_read_unlock(); |
9735 | } | 8141 | } |
9736 | 8142 | ||
9737 | /* | ||
9738 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9739 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9740 | * percpu_counter_add with values large enough to always overflow the | ||
9741 | * per cpu batch limit causing bad SMP scalability. | ||
9742 | * | ||
9743 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9744 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9745 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9746 | */ | ||
9747 | #ifdef CONFIG_SMP | ||
9748 | #define CPUACCT_BATCH \ | ||
9749 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9750 | #else | ||
9751 | #define CPUACCT_BATCH 0 | ||
9752 | #endif | ||
9753 | |||
9754 | /* | ||
9755 | * Charge the system/user time to the task's accounting group. | ||
9756 | */ | ||
9757 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
9758 | enum cpuacct_stat_index idx, cputime_t val) | ||
9759 | { | ||
9760 | struct cpuacct *ca; | ||
9761 | int batch = CPUACCT_BATCH; | ||
9762 | |||
9763 | if (unlikely(!cpuacct_subsys.active)) | ||
9764 | return; | ||
9765 | |||
9766 | rcu_read_lock(); | ||
9767 | ca = task_ca(tsk); | ||
9768 | |||
9769 | do { | ||
9770 | __percpu_counter_add(&ca->cpustat[idx], val, batch); | ||
9771 | ca = ca->parent; | ||
9772 | } while (ca); | ||
9773 | rcu_read_unlock(); | ||
9774 | } | ||
9775 | |||
9776 | struct cgroup_subsys cpuacct_subsys = { | 8143 | struct cgroup_subsys cpuacct_subsys = { |
9777 | .name = "cpuacct", | 8144 | .name = "cpuacct", |
9778 | .create = cpuacct_create, | 8145 | .create = cpuacct_create, |
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index a86cf9d9eb11..b0d798eaf130 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched_cpupri.c | 2 | * kernel/sched/cpupri.c |
3 | * | 3 | * |
4 | * CPU priority management | 4 | * CPU priority management |
5 | * | 5 | * |
@@ -28,7 +28,7 @@ | |||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
31 | #include "sched_cpupri.h" | 31 | #include "cpupri.h" |
32 | 32 | ||
33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
34 | static int convert_prio(int prio) | 34 | static int convert_prio(int prio) |
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index f6d756173491..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h | |||
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4f..2a075e10004b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/time/sched_debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree |
5 | * | 5 | * |
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | #include "sched.h" | ||
20 | |||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | 21 | static DEFINE_SPINLOCK(sched_debug_lock); |
20 | 22 | ||
21 | /* | 23 | /* |
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
373 | return 0; | 375 | return 0; |
374 | } | 376 | } |
375 | 377 | ||
376 | static void sysrq_sched_debug_show(void) | 378 | void sysrq_sched_debug_show(void) |
377 | { | 379 | { |
378 | sched_debug_show(NULL, NULL); | 380 | sched_debug_show(NULL, NULL); |
379 | } | 381 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index 8a39fa3e3c6c..8e42de9105f8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c | |||
@@ -23,6 +23,13 @@ | |||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/slab.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | |||
30 | #include <trace/events/sched.h> | ||
31 | |||
32 | #include "sched.h" | ||
26 | 33 | ||
27 | /* | 34 | /* |
28 | * Targeted preemption latency for CPU-bound tasks: | 35 | * Targeted preemption latency for CPU-bound tasks: |
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 110 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
104 | #endif | 111 | #endif |
105 | 112 | ||
106 | static const struct sched_class fair_sched_class; | 113 | /* |
114 | * Increase the granularity value when there are more CPUs, | ||
115 | * because with more CPUs the 'effective latency' as visible | ||
116 | * to users decreases. But the relationship is not linear, | ||
117 | * so pick a second-best guess by going with the log2 of the | ||
118 | * number of CPUs. | ||
119 | * | ||
120 | * This idea comes from the SD scheduler of Con Kolivas: | ||
121 | */ | ||
122 | static int get_update_sysctl_factor(void) | ||
123 | { | ||
124 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
125 | unsigned int factor; | ||
126 | |||
127 | switch (sysctl_sched_tunable_scaling) { | ||
128 | case SCHED_TUNABLESCALING_NONE: | ||
129 | factor = 1; | ||
130 | break; | ||
131 | case SCHED_TUNABLESCALING_LINEAR: | ||
132 | factor = cpus; | ||
133 | break; | ||
134 | case SCHED_TUNABLESCALING_LOG: | ||
135 | default: | ||
136 | factor = 1 + ilog2(cpus); | ||
137 | break; | ||
138 | } | ||
139 | |||
140 | return factor; | ||
141 | } | ||
142 | |||
143 | static void update_sysctl(void) | ||
144 | { | ||
145 | unsigned int factor = get_update_sysctl_factor(); | ||
146 | |||
147 | #define SET_SYSCTL(name) \ | ||
148 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
149 | SET_SYSCTL(sched_min_granularity); | ||
150 | SET_SYSCTL(sched_latency); | ||
151 | SET_SYSCTL(sched_wakeup_granularity); | ||
152 | #undef SET_SYSCTL | ||
153 | } | ||
154 | |||
155 | void sched_init_granularity(void) | ||
156 | { | ||
157 | update_sysctl(); | ||
158 | } | ||
159 | |||
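A quick worked example of the factor (values assumed): with eight or more online CPUs and the default SCHED_TUNABLESCALING_LOG policy, cpus is clamped to 8, so factor = 1 + ilog2(8) = 4 and each scaled tunable becomes four times its normalized value:

	/*
	 * num_online_cpus() >= 8, SCHED_TUNABLESCALING_LOG:
	 *   factor               = 1 + ilog2(8) = 4
	 *   sysctl_sched_latency = 4 * normalized_sysctl_sched_latency
	 *   (likewise for min_granularity and wakeup_granularity)
	 */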
160 | #if BITS_PER_LONG == 32 | ||
161 | # define WMULT_CONST (~0UL) | ||
162 | #else | ||
163 | # define WMULT_CONST (1UL << 32) | ||
164 | #endif | ||
165 | |||
166 | #define WMULT_SHIFT 32 | ||
167 | |||
168 | /* | ||
169 | * Shift right and round: | ||
170 | */ | ||
171 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
172 | |||
173 | /* | ||
174 | * delta *= weight / lw | ||
175 | */ | ||
176 | static unsigned long | ||
177 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
178 | struct load_weight *lw) | ||
179 | { | ||
180 | u64 tmp; | ||
181 | |||
182 | /* | ||
183 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
184 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
185 | * 2^SCHED_LOAD_RESOLUTION. | ||
186 | */ | ||
187 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
188 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
189 | else | ||
190 | tmp = (u64)delta_exec; | ||
191 | |||
192 | if (!lw->inv_weight) { | ||
193 | unsigned long w = scale_load_down(lw->weight); | ||
194 | |||
195 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
196 | lw->inv_weight = 1; | ||
197 | else if (unlikely(!w)) | ||
198 | lw->inv_weight = WMULT_CONST; | ||
199 | else | ||
200 | lw->inv_weight = WMULT_CONST / w; | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Check whether we'd overflow the 64-bit multiplication: | ||
205 | */ | ||
206 | if (unlikely(tmp > WMULT_CONST)) | ||
207 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
208 | WMULT_SHIFT/2); | ||
209 | else | ||
210 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
211 | |||
212 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
213 | } | ||
214 | |||
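A rough usage illustration (numbers assumed, and assuming the plain SCHED_LOAD_RESOLUTION = 0 weights where nice-0 is 1024): charging 1ms of wall-clock runtime to an entity of weight 1024 against a queue load of 2048 yields roughly half that, since delta is scaled by weight/lw->weight:

	struct load_weight lw = { .weight = 2048, .inv_weight = 0 };

	/* ~500000ns: 1000000 * 1024 / 2048, via the WMULT fixed-point path */
	calc_delta_mine(1000000, 1024, &lw);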
215 | |||
216 | const struct sched_class fair_sched_class; | ||
107 | 217 | ||
108 | /************************************************************** | 218 | /************************************************************** |
109 | * CFS operations on generic schedulable entities: | 219 | * CFS operations on generic schedulable entities: |
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
413 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 523 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
414 | } | 524 | } |
415 | 525 | ||
416 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) | 526 | struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
417 | { | 527 | { |
418 | struct rb_node *left = cfs_rq->rb_leftmost; | 528 | struct rb_node *left = cfs_rq->rb_leftmost; |
419 | 529 | ||
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) | |||
434 | } | 544 | } |
435 | 545 | ||
436 | #ifdef CONFIG_SCHED_DEBUG | 546 | #ifdef CONFIG_SCHED_DEBUG |
437 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 547 | struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
438 | { | 548 | { |
439 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 549 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
440 | 550 | ||
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
684 | { | 794 | { |
685 | update_load_add(&cfs_rq->load, se->load.weight); | 795 | update_load_add(&cfs_rq->load, se->load.weight); |
686 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
687 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
688 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) { |
689 | add_cfs_task_weight(cfs_rq, se->load.weight); | 799 | add_cfs_task_weight(cfs_rq, se->load.weight); |
690 | list_add(&se->group_node, &cfs_rq->tasks); | 800 | list_add(&se->group_node, &cfs_rq->tasks); |
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
697 | { | 807 | { |
698 | update_load_sub(&cfs_rq->load, se->load.weight); | 808 | update_load_sub(&cfs_rq->load, se->load.weight); |
699 | if (!parent_entity(se)) | 809 | if (!parent_entity(se)) |
700 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
701 | if (entity_is_task(se)) { | 811 | if (entity_is_task(se)) { |
702 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 812 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
703 | list_del_init(&se->group_node); | 813 | list_del_init(&se->group_node); |
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
893 | if (unlikely(delta > se->statistics.sleep_max)) | 1003 | if (unlikely(delta > se->statistics.sleep_max)) |
894 | se->statistics.sleep_max = delta; | 1004 | se->statistics.sleep_max = delta; |
895 | 1005 | ||
896 | se->statistics.sleep_start = 0; | ||
897 | se->statistics.sum_sleep_runtime += delta; | 1006 | se->statistics.sum_sleep_runtime += delta; |
898 | 1007 | ||
899 | if (tsk) { | 1008 | if (tsk) { |
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
910 | if (unlikely(delta > se->statistics.block_max)) | 1019 | if (unlikely(delta > se->statistics.block_max)) |
911 | se->statistics.block_max = delta; | 1020 | se->statistics.block_max = delta; |
912 | 1021 | ||
913 | se->statistics.block_start = 0; | ||
914 | se->statistics.sum_sleep_runtime += delta; | 1022 | se->statistics.sum_sleep_runtime += delta; |
915 | 1023 | ||
916 | if (tsk) { | 1024 | if (tsk) { |
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
920 | trace_sched_stat_iowait(tsk, delta); | 1028 | trace_sched_stat_iowait(tsk, delta); |
921 | } | 1029 | } |
922 | 1030 | ||
1031 | trace_sched_stat_blocked(tsk, delta); | ||
1032 | |||
923 | /* | 1033 | /* |
924 | * Blocking time is in units of nanosecs, so shift by | 1034 | * Blocking time is in units of nanosecs, so shift by |
925 | * 20 to get a milliseconds-range estimation of the | 1035 | * 20 to get a milliseconds-range estimation of the |
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1287 | */ | 1397 | */ |
1288 | 1398 | ||
1289 | #ifdef CONFIG_CFS_BANDWIDTH | 1399 | #ifdef CONFIG_CFS_BANDWIDTH |
1400 | |||
1401 | #ifdef HAVE_JUMP_LABEL | ||
1402 | static struct jump_label_key __cfs_bandwidth_used; | ||
1403 | |||
1404 | static inline bool cfs_bandwidth_used(void) | ||
1405 | { | ||
1406 | return static_branch(&__cfs_bandwidth_used); | ||
1407 | } | ||
1408 | |||
1409 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | ||
1410 | { | ||
1411 | /* only need to count groups transitioning between enabled/!enabled */ | ||
1412 | if (enabled && !was_enabled) | ||
1413 | jump_label_inc(&__cfs_bandwidth_used); | ||
1414 | else if (!enabled && was_enabled) | ||
1415 | jump_label_dec(&__cfs_bandwidth_used); | ||
1416 | } | ||
1417 | #else /* HAVE_JUMP_LABEL */ | ||
1418 | static bool cfs_bandwidth_used(void) | ||
1419 | { | ||
1420 | return true; | ||
1421 | } | ||
1422 | |||
1423 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | ||
1424 | #endif /* HAVE_JUMP_LABEL */ | ||
1425 | |||
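The static branch means that, while no group has a finite quota, cfs_bandwidth_used() is a patched-out jump and the hot-path checks added below are effectively free. The key is driven from tg_set_cfs_bandwidth() via the enable/disable transitions, roughly:

	/* a group gains a finite quota: jump_label_inc(), branch goes live */
	account_cfs_bandwidth_used(1, 0);

	/* it returns to RUNTIME_INF: jump_label_dec(); once the count hits
	 * zero the branch is patched back out */
	account_cfs_bandwidth_used(0, 1);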
1290 | /* | 1426 | /* |
1291 | * default period for cfs group bandwidth. | 1427 | * default period for cfs group bandwidth. |
1292 | * default: 0.1s, units: nanoseconds | 1428 | * default: 0.1s, units: nanoseconds |
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void) | |||
1308 | * | 1444 | * |
1309 | * requires cfs_b->lock | 1445 | * requires cfs_b->lock |
1310 | */ | 1446 | */ |
1311 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | 1447 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) |
1312 | { | 1448 | { |
1313 | u64 now; | 1449 | u64 now; |
1314 | 1450 | ||
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | |||
1320 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | 1456 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); |
1321 | } | 1457 | } |
1322 | 1458 | ||
1459 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
1460 | { | ||
1461 | return &tg->cfs_bandwidth; | ||
1462 | } | ||
1463 | |||
1323 | /* returns 0 on failure to allocate runtime */ | 1464 | /* returns 0 on failure to allocate runtime */ |
1324 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1465 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1325 | { | 1466 | { |
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
1421 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 1562 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
1422 | unsigned long delta_exec) | 1563 | unsigned long delta_exec) |
1423 | { | 1564 | { |
1424 | if (!cfs_rq->runtime_enabled) | 1565 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) |
1425 | return; | 1566 | return; |
1426 | 1567 | ||
1427 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | 1568 | __account_cfs_rq_runtime(cfs_rq, delta_exec); |
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
1429 | 1570 | ||
1430 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 1571 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
1431 | { | 1572 | { |
1432 | return cfs_rq->throttled; | 1573 | return cfs_bandwidth_used() && cfs_rq->throttled; |
1433 | } | 1574 | } |
1434 | 1575 | ||
1435 | /* check whether cfs_rq, or any parent, is throttled */ | 1576 | /* check whether cfs_rq, or any parent, is throttled */ |
1436 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | 1577 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) |
1437 | { | 1578 | { |
1438 | return cfs_rq->throttle_count; | 1579 | return cfs_bandwidth_used() && cfs_rq->throttle_count; |
1439 | } | 1580 | } |
1440 | 1581 | ||
1441 | /* | 1582 | /* |
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1530 | raw_spin_unlock(&cfs_b->lock); | 1671 | raw_spin_unlock(&cfs_b->lock); |
1531 | } | 1672 | } |
1532 | 1673 | ||
1533 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | 1674 | void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) |
1534 | { | 1675 | { |
1535 | struct rq *rq = rq_of(cfs_rq); | 1676 | struct rq *rq = rq_of(cfs_rq); |
1536 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 1677 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); |
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
1756 | 1897 | ||
1757 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1898 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1758 | { | 1899 | { |
1900 | if (!cfs_bandwidth_used()) | ||
1901 | return; | ||
1902 | |||
1759 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) | 1903 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) |
1760 | return; | 1904 | return; |
1761 | 1905 | ||
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
1801 | */ | 1945 | */ |
1802 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | 1946 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) |
1803 | { | 1947 | { |
1948 | if (!cfs_bandwidth_used()) | ||
1949 | return; | ||
1950 | |||
1804 | /* an active group must be handled by the update_curr()->put() path */ | 1951 | /* an active group must be handled by the update_curr()->put() path */ |
1805 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 1952 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
1806 | return; | 1953 | return; |
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
1818 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 1965 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
1819 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1966 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1820 | { | 1967 | { |
1968 | if (!cfs_bandwidth_used()) | ||
1969 | return; | ||
1970 | |||
1821 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 1971 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) |
1822 | return; | 1972 | return; |
1823 | 1973 | ||
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
1830 | 1980 | ||
1831 | throttle_cfs_rq(cfs_rq); | 1981 | throttle_cfs_rq(cfs_rq); |
1832 | } | 1982 | } |
1833 | #else | 1983 | |
1984 | static inline u64 default_cfs_period(void); | ||
1985 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
1986 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
1987 | |||
1988 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
1989 | { | ||
1990 | struct cfs_bandwidth *cfs_b = | ||
1991 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
1992 | do_sched_cfs_slack_timer(cfs_b); | ||
1993 | |||
1994 | return HRTIMER_NORESTART; | ||
1995 | } | ||
1996 | |||
1997 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
1998 | { | ||
1999 | struct cfs_bandwidth *cfs_b = | ||
2000 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
2001 | ktime_t now; | ||
2002 | int overrun; | ||
2003 | int idle = 0; | ||
2004 | |||
2005 | for (;;) { | ||
2006 | now = hrtimer_cb_get_time(timer); | ||
2007 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
2008 | |||
2009 | if (!overrun) | ||
2010 | break; | ||
2011 | |||
2012 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
2013 | } | ||
2014 | |||
2015 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
2016 | } | ||
2017 | |||
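This is the usual pattern for a periodic hrtimer: hrtimer_forward() pushes the expiry past 'now' in whole periods and returns how many were skipped, so a late callback is handed the overrun count instead of firing once per missed period. A worked example with assumed numbers:

	/*
	 * period = 100ms, callback runs 250ms after the last expiry:
	 *   1st pass: hrtimer_forward() advances 3 periods, overrun = 3,
	 *             do_sched_cfs_period_timer(cfs_b, 3) refills once for all 3
	 *   2nd pass: overrun = 0 -> break; timer restarts unless idle
	 */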
2018 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2019 | { | ||
2020 | raw_spin_lock_init(&cfs_b->lock); | ||
2021 | cfs_b->runtime = 0; | ||
2022 | cfs_b->quota = RUNTIME_INF; | ||
2023 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
2024 | |||
2025 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
2026 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2027 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
2028 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2029 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
2030 | } | ||
2031 | |||
2032 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
2033 | { | ||
2034 | cfs_rq->runtime_enabled = 0; | ||
2035 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
2036 | } | ||
2037 | |||
2038 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
2039 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2040 | { | ||
2041 | /* | ||
2042 | * The timer may be active because we're trying to set a new bandwidth | ||
2043 | * period or because we're racing with the tear-down path | ||
2044 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
2045 | * terminates). In either case we ensure that it's re-programmed | ||
2046 | */ | ||
2047 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
2048 | raw_spin_unlock(&cfs_b->lock); | ||
2049 | /* ensure cfs_b->lock is available while we wait */ | ||
2050 | hrtimer_cancel(&cfs_b->period_timer); | ||
2051 | |||
2052 | raw_spin_lock(&cfs_b->lock); | ||
2053 | /* if someone else restarted the timer then we're done */ | ||
2054 | if (cfs_b->timer_active) | ||
2055 | return; | ||
2056 | } | ||
2057 | |||
2058 | cfs_b->timer_active = 1; | ||
2059 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
2060 | } | ||
2061 | |||
2062 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2063 | { | ||
2064 | hrtimer_cancel(&cfs_b->period_timer); | ||
2065 | hrtimer_cancel(&cfs_b->slack_timer); | ||
2066 | } | ||
2067 | |||
2068 | void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
2069 | { | ||
2070 | struct cfs_rq *cfs_rq; | ||
2071 | |||
2072 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
2073 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
2074 | |||
2075 | if (!cfs_rq->runtime_enabled) | ||
2076 | continue; | ||
2077 | |||
2078 | /* | ||
2079 | * clock_task is not advancing so we just need to make sure | ||
2080 | * there's some valid quota amount | ||
2081 | */ | ||
2082 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
2083 | if (cfs_rq_throttled(cfs_rq)) | ||
2084 | unthrottle_cfs_rq(cfs_rq); | ||
2085 | } | ||
2086 | } | ||
2087 | |||
2088 | #else /* CONFIG_CFS_BANDWIDTH */ | ||
1834 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2089 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
1835 | unsigned long delta_exec) {} | 2090 | unsigned long delta_exec) {} |
1836 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2091 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg, | |||
1852 | { | 2107 | { |
1853 | return 0; | 2108 | return 0; |
1854 | } | 2109 | } |
2110 | |||
2111 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2112 | |||
2113 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
2114 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1855 | #endif | 2115 | #endif |
1856 | 2116 | ||
2117 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
2118 | { | ||
2119 | return NULL; | ||
2120 | } | ||
2121 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2122 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
2123 | |||
2124 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
2125 | |||
1857 | /************************************************** | 2126 | /************************************************** |
1858 | * CFS operations on tasks: | 2127 | * CFS operations on tasks: |
1859 | */ | 2128 | */ |
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
1866 | 2135 | ||
1867 | WARN_ON(task_rq(p) != rq); | 2136 | WARN_ON(task_rq(p) != rq); |
1868 | 2137 | ||
1869 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | 2138 | if (cfs_rq->nr_running > 1) { |
1870 | u64 slice = sched_slice(cfs_rq, se); | 2139 | u64 slice = sched_slice(cfs_rq, se); |
1871 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 2140 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
1872 | s64 delta = slice - ran; | 2141 | s64 delta = slice - ran; |
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq) | |||
1897 | { | 2166 | { |
1898 | struct task_struct *curr = rq->curr; | 2167 | struct task_struct *curr = rq->curr; |
1899 | 2168 | ||
1900 | if (curr->sched_class != &fair_sched_class) | 2169 | if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) |
1901 | return; | 2170 | return; |
1902 | 2171 | ||
1903 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) | 2172 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) |
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2020 | } | 2289 | } |
2021 | 2290 | ||
2022 | #ifdef CONFIG_SMP | 2291 | #ifdef CONFIG_SMP |
2292 | /* Used instead of source_load when we know the type == 0 */ | ||
2293 | static unsigned long weighted_cpuload(const int cpu) | ||
2294 | { | ||
2295 | return cpu_rq(cpu)->load.weight; | ||
2296 | } | ||
2297 | |||
2298 | /* | ||
2299 | * Return a low guess at the load of a migration-source cpu weighted | ||
2300 | * according to the scheduling class and "nice" value. | ||
2301 | * | ||
2302 | * We want to under-estimate the load of migration sources, to | ||
2303 | * balance conservatively. | ||
2304 | */ | ||
2305 | static unsigned long source_load(int cpu, int type) | ||
2306 | { | ||
2307 | struct rq *rq = cpu_rq(cpu); | ||
2308 | unsigned long total = weighted_cpuload(cpu); | ||
2309 | |||
2310 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2311 | return total; | ||
2312 | |||
2313 | return min(rq->cpu_load[type-1], total); | ||
2314 | } | ||
2315 | |||
2316 | /* | ||
2317 | * Return a high guess at the load of a migration-target cpu weighted | ||
2318 | * according to the scheduling class and "nice" value. | ||
2319 | */ | ||
2320 | static unsigned long target_load(int cpu, int type) | ||
2321 | { | ||
2322 | struct rq *rq = cpu_rq(cpu); | ||
2323 | unsigned long total = weighted_cpuload(cpu); | ||
2324 | |||
2325 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2326 | return total; | ||
2327 | |||
2328 | return max(rq->cpu_load[type-1], total); | ||
2329 | } | ||
2330 | |||
2331 | static unsigned long power_of(int cpu) | ||
2332 | { | ||
2333 | return cpu_rq(cpu)->cpu_power; | ||
2334 | } | ||
2335 | |||
2336 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
2337 | { | ||
2338 | struct rq *rq = cpu_rq(cpu); | ||
2339 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
2340 | |||
2341 | if (nr_running) | ||
2342 | return rq->load.weight / nr_running; | ||
2343 | |||
2344 | return 0; | ||
2345 | } | ||
2346 | |||
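A small worked example of the bias (numbers assumed): if the tracked cpu_load[type-1] is 2048 while the instantaneous load.weight is 1024, the source side is credited with the smaller figure and the target side with the larger one, so the balancer errs against migrating:

	/* cpu_load[type-1] = 2048, weighted_cpuload(cpu) = 1024, LB_BIAS on */
	source_load(cpu, 1);	/* min(2048, 1024) = 1024 */
	target_load(cpu, 1);	/* max(2048, 1024) = 2048 */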
2023 | 2347 | ||
2024 | static void task_waking_fair(struct task_struct *p) | 2348 | static void task_waking_fair(struct task_struct *p) |
2025 | { | 2349 | { |
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2327 | int prev_cpu = task_cpu(p); | 2651 | int prev_cpu = task_cpu(p); |
2328 | struct sched_domain *sd; | 2652 | struct sched_domain *sd; |
2329 | struct sched_group *sg; | 2653 | struct sched_group *sg; |
2330 | int i, smt = 0; | 2654 | int i; |
2331 | 2655 | ||
2332 | /* | 2656 | /* |
2333 | * If the task is going to be woken-up on this cpu and if it is | 2657 | * If the task is going to be woken-up on this cpu and if it is |
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2347 | * Otherwise, iterate the domains and find an eligible idle cpu. | 2671 | * Otherwise, iterate the domains and find an eligible idle cpu.
2348 | */ | 2672 | */ |
2349 | rcu_read_lock(); | 2673 | rcu_read_lock(); |
2350 | again: | ||
2351 | for_each_domain(target, sd) { | ||
2352 | if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) | ||
2353 | continue; | ||
2354 | |||
2355 | if (smt && !(sd->flags & SD_SHARE_CPUPOWER)) | ||
2356 | break; | ||
2357 | |||
2358 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | ||
2359 | break; | ||
2360 | 2674 | ||
2675 | sd = rcu_dereference(per_cpu(sd_llc, target)); | ||
2676 | for_each_lower_domain(sd) { | ||
2361 | sg = sd->groups; | 2677 | sg = sd->groups; |
2362 | do { | 2678 | do { |
2363 | if (!cpumask_intersects(sched_group_cpus(sg), | 2679 | if (!cpumask_intersects(sched_group_cpus(sg), |
@@ -2376,10 +2692,6 @@ next: | |||
2376 | sg = sg->next; | 2692 | sg = sg->next; |
2377 | } while (sg != sd->groups); | 2693 | } while (sg != sd->groups); |
2378 | } | 2694 | } |
2379 | if (!smt) { | ||
2380 | smt = 1; | ||
2381 | goto again; | ||
2382 | } | ||
2383 | done: | 2695 | done: |
2384 | rcu_read_unlock(); | 2696 | rcu_read_unlock(); |
2385 | 2697 | ||
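The rewritten select_idle_sibling() no longer re-derives the cache-sharing level with SD_SHARE_CPUPOWER/SD_SHARE_PKG_RESOURCES flag tests on every wakeup; it starts from the cached per-cpu sd_llc pointer and walks downward with for_each_lower_domain(), both of which are defined outside this hunk in the new sched headers. A plain-C sketch of that cached top-down child walk, with invented domain names:

    #include <stdio.h>

    /* analogue of the sd->child chain that for_each_lower_domain() follows */
    struct domain {
            const char *name;
            struct domain *child;
    };

    int main(void)
    {
            struct domain smt = { "SMT", NULL };
            struct domain mc  = { "MC",  &smt };    /* stands in for sd_llc */

            /* scan each level once, starting at the last-level-cache domain */
            for (struct domain *sd = &mc; sd; sd = sd->child)
                    printf("scan groups of %s domain\n", sd->name);
            return 0;
    }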
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2408 | int want_sd = 1; | 2720 | int want_sd = 1; |
2409 | int sync = wake_flags & WF_SYNC; | 2721 | int sync = wake_flags & WF_SYNC; |
2410 | 2722 | ||
2723 | if (p->rt.nr_cpus_allowed == 1) | ||
2724 | return prev_cpu; | ||
2725 | |||
2411 | if (sd_flag & SD_BALANCE_WAKE) { | 2726 | if (sd_flag & SD_BALANCE_WAKE) { |
2412 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 2727 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
2413 | want_affine = 1; | 2728 | want_affine = 1; |
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
2692 | } while (cfs_rq); | 3007 | } while (cfs_rq); |
2693 | 3008 | ||
2694 | p = task_of(se); | 3009 | p = task_of(se); |
2695 | hrtick_start_fair(rq, p); | 3010 | if (hrtick_enabled(rq)) |
3011 | hrtick_start_fair(rq, p); | ||
2696 | 3012 | ||
2697 | return p; | 3013 | return p; |
2698 | } | 3014 | } |
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq) | |||
2736 | * Update run-time statistics of the 'current'. | 3052 | * Update run-time statistics of the 'current'. |
2737 | */ | 3053 | */ |
2738 | update_curr(cfs_rq); | 3054 | update_curr(cfs_rq); |
3055 | /* | ||
3056 | * Tell update_rq_clock() that we've just updated, | ||
3057 | * so we don't do microscopic update in schedule() | ||
3058 | * and double the fastpath cost. | ||
3059 | */ | ||
3060 | rq->skip_clock_update = 1; | ||
2739 | } | 3061 | } |
2740 | 3062 | ||
2741 | set_skip_buddy(se); | 3063 | set_skip_buddy(se); |
@@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2776 | } | 3098 | } |
2777 | 3099 | ||
2778 | /* | 3100 | /* |
3101 | * Is this task likely cache-hot: | ||
3102 | */ | ||
3103 | static int | ||
3104 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
3105 | { | ||
3106 | s64 delta; | ||
3107 | |||
3108 | if (p->sched_class != &fair_sched_class) | ||
3109 | return 0; | ||
3110 | |||
3111 | if (unlikely(p->policy == SCHED_IDLE)) | ||
3112 | return 0; | ||
3113 | |||
3114 | /* | ||
3115 | * Buddy candidates are cache hot: | ||
3116 | */ | ||
3117 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
3118 | (&p->se == cfs_rq_of(&p->se)->next || | ||
3119 | &p->se == cfs_rq_of(&p->se)->last)) | ||
3120 | return 1; | ||
3121 | |||
3122 | if (sysctl_sched_migration_cost == -1) | ||
3123 | return 1; | ||
3124 | if (sysctl_sched_migration_cost == 0) | ||
3125 | return 0; | ||
3126 | |||
3127 | delta = now - p->se.exec_start; | ||
3128 | |||
3129 | return delta < (s64)sysctl_sched_migration_cost; | ||
3130 | } | ||
3131 | |||
3132 | #define LBF_ALL_PINNED 0x01 | ||
3133 | #define LBF_NEED_BREAK 0x02 | ||
3134 | #define LBF_ABORT 0x04 | ||
3135 | |||
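The three LBF_* bits defined above fold the old single-purpose *all_pinned output parameter into one status word that the inner balancing loops can extend: LBF_ALL_PINNED is cleared once any movable task is seen, LBF_NEED_BREAK asks the caller to drop locks and retry, LBF_ABORT asks it to stop. A small standalone sketch of that flag-accumulation pattern; scan_tasks_sketch() and its arguments are invented for illustration:

    #include <stdio.h>

    #define LBF_ALL_PINNED 0x01
    #define LBF_NEED_BREAK 0x02
    #define LBF_ABORT      0x04

    /* the callee reports several conditions through one int instead of
     * one output pointer per condition */
    static void scan_tasks_sketch(int *lb_flags, int movable, int too_many_loops)
    {
            if (movable)
                    *lb_flags &= ~LBF_ALL_PINNED;   /* saw a movable task */
            if (too_many_loops)
                    *lb_flags |= LBF_NEED_BREAK;    /* drop locks and retry */
    }

    int main(void)
    {
            int lb_flags = LBF_ALL_PINNED;  /* assume pinned until proven otherwise */

            scan_tasks_sketch(&lb_flags, 1, 1);

            if (lb_flags & LBF_ABORT)
                    printf("abort\n");
            else if (lb_flags & LBF_NEED_BREAK)
                    printf("retry (flags=%#x)\n", lb_flags);
            return 0;
    }

The load_balance() changes later in this hunk test the same word once per pass for the redo/abort decisions.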
3136 | /* | ||
2779 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3137 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2780 | */ | 3138 | */ |
2781 | static | 3139 | static |
2782 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 3140 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2783 | struct sched_domain *sd, enum cpu_idle_type idle, | 3141 | struct sched_domain *sd, enum cpu_idle_type idle, |
2784 | int *all_pinned) | 3142 | int *lb_flags) |
2785 | { | 3143 | { |
2786 | int tsk_cache_hot = 0; | 3144 | int tsk_cache_hot = 0; |
2787 | /* | 3145 | /* |
@@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2794 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3152 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
2795 | return 0; | 3153 | return 0; |
2796 | } | 3154 | } |
2797 | *all_pinned = 0; | 3155 | *lb_flags &= ~LBF_ALL_PINNED; |
2798 | 3156 | ||
2799 | if (task_running(rq, p)) { | 3157 | if (task_running(rq, p)) { |
2800 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 3158 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
@@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2868 | static unsigned long | 3226 | static unsigned long |
2869 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3227 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2870 | unsigned long max_load_move, struct sched_domain *sd, | 3228 | unsigned long max_load_move, struct sched_domain *sd, |
2871 | enum cpu_idle_type idle, int *all_pinned, | 3229 | enum cpu_idle_type idle, int *lb_flags, |
2872 | struct cfs_rq *busiest_cfs_rq) | 3230 | struct cfs_rq *busiest_cfs_rq) |
2873 | { | 3231 | { |
2874 | int loops = 0, pulled = 0; | 3232 | int loops = 0, pulled = 0; |
@@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2879 | goto out; | 3237 | goto out; |
2880 | 3238 | ||
2881 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 3239 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
2882 | if (loops++ > sysctl_sched_nr_migrate) | 3240 | if (loops++ > sysctl_sched_nr_migrate) { |
3241 | *lb_flags |= LBF_NEED_BREAK; | ||
2883 | break; | 3242 | break; |
3243 | } | ||
2884 | 3244 | ||
2885 | if ((p->se.load.weight >> 1) > rem_load_move || | 3245 | if ((p->se.load.weight >> 1) > rem_load_move || |
2886 | !can_migrate_task(p, busiest, this_cpu, sd, idle, | 3246 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2887 | all_pinned)) | 3247 | lb_flags)) |
2888 | continue; | 3248 | continue; |
2889 | 3249 | ||
2890 | pull_task(busiest, p, this_rq, this_cpu); | 3250 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2897 | * kernels will stop after the first task is pulled to minimize | 3257 | * kernels will stop after the first task is pulled to minimize |
2898 | * the critical section. | 3258 | * the critical section. |
2899 | */ | 3259 | */ |
2900 | if (idle == CPU_NEWLY_IDLE) | 3260 | if (idle == CPU_NEWLY_IDLE) { |
3261 | *lb_flags |= LBF_ABORT; | ||
2901 | break; | 3262 | break; |
3263 | } | ||
2902 | #endif | 3264 | #endif |
2903 | 3265 | ||
2904 | /* | 3266 | /* |
@@ -3003,7 +3365,7 @@ static unsigned long | |||
3003 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3365 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
3004 | unsigned long max_load_move, | 3366 | unsigned long max_load_move, |
3005 | struct sched_domain *sd, enum cpu_idle_type idle, | 3367 | struct sched_domain *sd, enum cpu_idle_type idle, |
3006 | int *all_pinned) | 3368 | int *lb_flags) |
3007 | { | 3369 | { |
3008 | long rem_load_move = max_load_move; | 3370 | long rem_load_move = max_load_move; |
3009 | struct cfs_rq *busiest_cfs_rq; | 3371 | struct cfs_rq *busiest_cfs_rq; |
@@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3016 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 3378 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
3017 | u64 rem_load, moved_load; | 3379 | u64 rem_load, moved_load; |
3018 | 3380 | ||
3381 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3382 | break; | ||
3383 | |||
3019 | /* | 3384 | /* |
3020 | * empty group or part of a throttled hierarchy | 3385 | * empty group or part of a throttled hierarchy |
3021 | */ | 3386 | */ |
@@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3027 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 3392 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
3028 | 3393 | ||
3029 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 3394 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
3030 | rem_load, sd, idle, all_pinned, | 3395 | rem_load, sd, idle, lb_flags, |
3031 | busiest_cfs_rq); | 3396 | busiest_cfs_rq); |
3032 | 3397 | ||
3033 | if (!moved_load) | 3398 | if (!moved_load) |
@@ -3053,10 +3418,10 @@ static unsigned long | |||
3053 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3418 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
3054 | unsigned long max_load_move, | 3419 | unsigned long max_load_move, |
3055 | struct sched_domain *sd, enum cpu_idle_type idle, | 3420 | struct sched_domain *sd, enum cpu_idle_type idle, |
3056 | int *all_pinned) | 3421 | int *lb_flags) |
3057 | { | 3422 | { |
3058 | return balance_tasks(this_rq, this_cpu, busiest, | 3423 | return balance_tasks(this_rq, this_cpu, busiest, |
3059 | max_load_move, sd, idle, all_pinned, | 3424 | max_load_move, sd, idle, lb_flags, |
3060 | &busiest->cfs); | 3425 | &busiest->cfs); |
3061 | } | 3426 | } |
3062 | #endif | 3427 | #endif |
@@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3071 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3436 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
3072 | unsigned long max_load_move, | 3437 | unsigned long max_load_move, |
3073 | struct sched_domain *sd, enum cpu_idle_type idle, | 3438 | struct sched_domain *sd, enum cpu_idle_type idle, |
3074 | int *all_pinned) | 3439 | int *lb_flags) |
3075 | { | 3440 | { |
3076 | unsigned long total_load_moved = 0, load_moved; | 3441 | unsigned long total_load_moved = 0, load_moved; |
3077 | 3442 | ||
3078 | do { | 3443 | do { |
3079 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 3444 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
3080 | max_load_move - total_load_moved, | 3445 | max_load_move - total_load_moved, |
3081 | sd, idle, all_pinned); | 3446 | sd, idle, lb_flags); |
3082 | 3447 | ||
3083 | total_load_moved += load_moved; | 3448 | total_load_moved += load_moved; |
3084 | 3449 | ||
3450 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3451 | break; | ||
3452 | |||
3085 | #ifdef CONFIG_PREEMPT | 3453 | #ifdef CONFIG_PREEMPT |
3086 | /* | 3454 | /* |
3087 | * NEWIDLE balancing is a source of latency, so preemptible | 3455 | * NEWIDLE balancing is a source of latency, so preemptible |
3088 | * kernels will stop after the first task is pulled to minimize | 3456 | * kernels will stop after the first task is pulled to minimize |
3089 | * the critical section. | 3457 | * the critical section. |
3090 | */ | 3458 | */ |
3091 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | 3459 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { |
3092 | break; | 3460 | *lb_flags |= LBF_ABORT; |
3093 | |||
3094 | if (raw_spin_is_contended(&this_rq->lock) || | ||
3095 | raw_spin_is_contended(&busiest->lock)) | ||
3096 | break; | 3461 | break; |
3462 | } | ||
3097 | #endif | 3463 | #endif |
3098 | } while (load_moved && max_load_move > total_load_moved); | 3464 | } while (load_moved && max_load_move > total_load_moved); |
3099 | 3465 | ||
@@ -3155,15 +3521,6 @@ struct sg_lb_stats { | |||
3155 | }; | 3521 | }; |
3156 | 3522 | ||
3157 | /** | 3523 | /** |
3158 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
3159 | * @group: The group whose first cpu is to be returned. | ||
3160 | */ | ||
3161 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
3162 | { | ||
3163 | return cpumask_first(sched_group_cpus(group)); | ||
3164 | } | ||
3165 | |||
3166 | /** | ||
3167 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 3524 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
3168 | * @sd: The sched_domain whose load_idx is to be obtained. | 3525 | * @sd: The sched_domain whose load_idx is to be obtained. |
3168 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. | 3525 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. |
@@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
3412 | sdg->sgp->power = power; | 3769 | sdg->sgp->power = power; |
3413 | } | 3770 | } |
3414 | 3771 | ||
3415 | static void update_group_power(struct sched_domain *sd, int cpu) | 3772 | void update_group_power(struct sched_domain *sd, int cpu) |
3416 | { | 3773 | { |
3417 | struct sched_domain *child = sd->child; | 3774 | struct sched_domain *child = sd->child; |
3418 | struct sched_group *group, *sdg = sd->groups; | 3775 | struct sched_group *group, *sdg = sd->groups; |
@@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3678 | } while (sg != sd->groups); | 4035 | } while (sg != sd->groups); |
3679 | } | 4036 | } |
3680 | 4037 | ||
3681 | int __weak arch_sd_sibling_asym_packing(void) | ||
3682 | { | ||
3683 | return 0*SD_ASYM_PACKING; | ||
3684 | } | ||
3685 | |||
3686 | /** | 4038 | /** |
3687 | * check_asym_packing - Check to see if the group is packed into the | 4039 | * check_asym_packing - Check to see if the group is packed into the |
3688 | * sched domain. | 4040 | * sched domain. |
@@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4046 | #define MAX_PINNED_INTERVAL 512 | 4398 | #define MAX_PINNED_INTERVAL 512 |
4047 | 4399 | ||
4048 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4400 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4049 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4401 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4050 | 4402 | ||
4051 | static int need_active_balance(struct sched_domain *sd, int idle, | 4403 | static int need_active_balance(struct sched_domain *sd, int idle, |
4052 | int busiest_cpu, int this_cpu) | 4404 | int busiest_cpu, int this_cpu) |
@@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4097 | struct sched_domain *sd, enum cpu_idle_type idle, | 4449 | struct sched_domain *sd, enum cpu_idle_type idle, |
4098 | int *balance) | 4450 | int *balance) |
4099 | { | 4451 | { |
4100 | int ld_moved, all_pinned = 0, active_balance = 0; | 4452 | int ld_moved, lb_flags = 0, active_balance = 0; |
4101 | struct sched_group *group; | 4453 | struct sched_group *group; |
4102 | unsigned long imbalance; | 4454 | unsigned long imbalance; |
4103 | struct rq *busiest; | 4455 | struct rq *busiest; |
@@ -4138,11 +4490,11 @@ redo: | |||
4138 | * still unbalanced. ld_moved simply stays zero, so it is | 4490 | * still unbalanced. ld_moved simply stays zero, so it is |
4139 | * correctly treated as an imbalance. | 4491 | * correctly treated as an imbalance. |
4140 | */ | 4492 | */ |
4141 | all_pinned = 1; | 4493 | lb_flags |= LBF_ALL_PINNED; |
4142 | local_irq_save(flags); | 4494 | local_irq_save(flags); |
4143 | double_rq_lock(this_rq, busiest); | 4495 | double_rq_lock(this_rq, busiest); |
4144 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 4496 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
4145 | imbalance, sd, idle, &all_pinned); | 4497 | imbalance, sd, idle, &lb_flags); |
4146 | double_rq_unlock(this_rq, busiest); | 4498 | double_rq_unlock(this_rq, busiest); |
4147 | local_irq_restore(flags); | 4499 | local_irq_restore(flags); |
4148 | 4500 | ||
@@ -4152,8 +4504,16 @@ redo: | |||
4152 | if (ld_moved && this_cpu != smp_processor_id()) | 4504 | if (ld_moved && this_cpu != smp_processor_id()) |
4153 | resched_cpu(this_cpu); | 4505 | resched_cpu(this_cpu); |
4154 | 4506 | ||
4507 | if (lb_flags & LBF_ABORT) | ||
4508 | goto out_balanced; | ||
4509 | |||
4510 | if (lb_flags & LBF_NEED_BREAK) { | ||
4511 | lb_flags &= ~LBF_NEED_BREAK; | ||
4512 | goto redo; | ||
4513 | } | ||
4514 | |||
4155 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4515 | /* All tasks on this runqueue were pinned by CPU affinity */ |
4156 | if (unlikely(all_pinned)) { | 4516 | if (unlikely(lb_flags & LBF_ALL_PINNED)) { |
4157 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4517 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
4158 | if (!cpumask_empty(cpus)) | 4518 | if (!cpumask_empty(cpus)) |
4159 | goto redo; | 4519 | goto redo; |
@@ -4183,7 +4543,7 @@ redo: | |||
4183 | tsk_cpus_allowed(busiest->curr))) { | 4543 | tsk_cpus_allowed(busiest->curr))) { |
4184 | raw_spin_unlock_irqrestore(&busiest->lock, | 4544 | raw_spin_unlock_irqrestore(&busiest->lock, |
4185 | flags); | 4545 | flags); |
4186 | all_pinned = 1; | 4546 | lb_flags |= LBF_ALL_PINNED; |
4187 | goto out_one_pinned; | 4547 | goto out_one_pinned; |
4188 | } | 4548 | } |
4189 | 4549 | ||
@@ -4236,7 +4596,8 @@ out_balanced: | |||
4236 | 4596 | ||
4237 | out_one_pinned: | 4597 | out_one_pinned: |
4238 | /* tune up the balancing interval */ | 4598 | /* tune up the balancing interval */ |
4239 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 4599 | if (((lb_flags & LBF_ALL_PINNED) && |
4600 | sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
4240 | (sd->balance_interval < sd->max_interval)) | 4601 | (sd->balance_interval < sd->max_interval)) |
4241 | sd->balance_interval *= 2; | 4602 | sd->balance_interval *= 2; |
4242 | 4603 | ||
@@ -4249,7 +4610,7 @@ out: | |||
4249 | * idle_balance is called by schedule() if this_cpu is about to become | 4610 | * idle_balance is called by schedule() if this_cpu is about to become |
4250 | * idle. Attempts to pull tasks from other CPUs. | 4611 | * idle. Attempts to pull tasks from other CPUs. |
4251 | */ | 4612 | */ |
4252 | static void idle_balance(int this_cpu, struct rq *this_rq) | 4613 | void idle_balance(int this_cpu, struct rq *this_rq) |
4253 | { | 4614 | { |
4254 | struct sched_domain *sd; | 4615 | struct sched_domain *sd; |
4255 | int pulled_task = 0; | 4616 | int pulled_task = 0; |
@@ -4364,28 +4725,16 @@ out_unlock: | |||
4364 | #ifdef CONFIG_NO_HZ | 4725 | #ifdef CONFIG_NO_HZ |
4365 | /* | 4726 | /* |
4366 | * idle load balancing details | 4727 | * idle load balancing details |
4367 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
4368 | * entering idle. | ||
4369 | * - This idle load balancer CPU will also go into tickless mode when | ||
4370 | * it is idle, just like all other idle CPUs | ||
4371 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 4728 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
4372 | * needed, they will kick the idle load balancer, which then does idle | 4729 | * needed, they will kick the idle load balancer, which then does idle |
4373 | * load balancing for all the idle CPUs. | 4730 | * load balancing for all the idle CPUs. |
4374 | */ | 4731 | */ |
4375 | static struct { | 4732 | static struct { |
4376 | atomic_t load_balancer; | ||
4377 | atomic_t first_pick_cpu; | ||
4378 | atomic_t second_pick_cpu; | ||
4379 | cpumask_var_t idle_cpus_mask; | 4733 | cpumask_var_t idle_cpus_mask; |
4380 | cpumask_var_t grp_idle_mask; | 4734 | atomic_t nr_cpus; |
4381 | unsigned long next_balance; /* in jiffy units */ | 4735 | unsigned long next_balance; /* in jiffy units */ |
4382 | } nohz ____cacheline_aligned; | 4736 | } nohz ____cacheline_aligned; |
4383 | 4737 | ||
4384 | int get_nohz_load_balancer(void) | ||
4385 | { | ||
4386 | return atomic_read(&nohz.load_balancer); | ||
4387 | } | ||
4388 | |||
4389 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4738 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
4390 | /** | 4739 | /** |
4391 | * lowest_flag_domain - Return lowest sched_domain containing flag. | 4740 | * lowest_flag_domain - Return lowest sched_domain containing flag. |
@@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
4422 | (sd && (sd->flags & flag)); sd = sd->parent) | 4771 | (sd && (sd->flags & flag)); sd = sd->parent) |
4423 | 4772 | ||
4424 | /** | 4773 | /** |
4425 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4426 | * @ilb_group: group to be checked for semi-idleness | ||
4427 | * | ||
4428 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4429 | * | ||
4430 | * We define a sched_group to be semi idle if it has at least one idle-CPU | ||
4431 | * and at least one non-idle CPU. This helper function checks if the given | ||
4432 | * sched_group is semi-idle or not. | ||
4433 | */ | ||
4434 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4435 | { | ||
4436 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, | ||
4437 | sched_group_cpus(ilb_group)); | ||
4438 | |||
4439 | /* | ||
4440 | * A sched_group is semi-idle when it has at least one busy cpu | ||
4441 | * and at least one idle cpu. | ||
4442 | */ | ||
4443 | if (cpumask_empty(nohz.grp_idle_mask)) | ||
4444 | return 0; | ||
4445 | |||
4446 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) | ||
4447 | return 0; | ||
4448 | |||
4449 | return 1; | ||
4450 | } | ||
4451 | /** | ||
4452 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | 4774 | * find_new_ilb - Finds the optimum idle load balancer for nomination. |
4453 | * @cpu: The cpu which is nominating a new idle_load_balancer. | 4775 | * @cpu: The cpu which is nominating a new idle_load_balancer. |
4454 | * | 4776 | * |
@@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) | |||
4462 | */ | 4784 | */ |
4463 | static int find_new_ilb(int cpu) | 4785 | static int find_new_ilb(int cpu) |
4464 | { | 4786 | { |
4787 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
4788 | struct sched_group *ilbg; | ||
4465 | struct sched_domain *sd; | 4789 | struct sched_domain *sd; |
4466 | struct sched_group *ilb_group; | ||
4467 | int ilb = nr_cpu_ids; | ||
4468 | 4790 | ||
4469 | /* | 4791 | /* |
4470 | * Have idle load balancer selection from semi-idle packages only | 4792 | * Have idle load balancer selection from semi-idle packages only |
@@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu) | |||
4482 | 4804 | ||
4483 | rcu_read_lock(); | 4805 | rcu_read_lock(); |
4484 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 4806 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
4485 | ilb_group = sd->groups; | 4807 | ilbg = sd->groups; |
4486 | 4808 | ||
4487 | do { | 4809 | do { |
4488 | if (is_semi_idle_group(ilb_group)) { | 4810 | if (ilbg->group_weight != |
4489 | ilb = cpumask_first(nohz.grp_idle_mask); | 4811 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { |
4812 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4813 | sched_group_cpus(ilbg)); | ||
4490 | goto unlock; | 4814 | goto unlock; |
4491 | } | 4815 | } |
4492 | 4816 | ||
4493 | ilb_group = ilb_group->next; | 4817 | ilbg = ilbg->next; |
4494 | 4818 | ||
4495 | } while (ilb_group != sd->groups); | 4819 | } while (ilbg != sd->groups); |
4496 | } | 4820 | } |
4497 | unlock: | 4821 | unlock: |
4498 | rcu_read_unlock(); | 4822 | rcu_read_unlock(); |
4499 | 4823 | ||
4500 | out_done: | 4824 | out_done: |
4501 | return ilb; | 4825 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4826 | return ilb; | ||
4827 | |||
4828 | return nr_cpu_ids; | ||
4502 | } | 4829 | } |
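find_new_ilb() now simply picks the first idle CPU from nohz.idle_cpus_mask (restricted to a partially busy group when power-savings balancing is enabled) instead of consulting an elected load_balancer. A user-space sketch of the mask lookup only; the 64-bit word stands in for the real cpumask and __builtin_ctzll() for cpumask_first():

    #include <stdio.h>
    #include <stdint.h>

    #define NR_CPUS 64

    /* stand-in for nohz.idle_cpus_mask: one bit per tickless-idle cpu */
    static uint64_t idle_cpus_mask;

    static int first_idle_cpu(void)
    {
            if (!idle_cpus_mask)
                    return NR_CPUS;                 /* like returning nr_cpu_ids */
            return __builtin_ctzll(idle_cpus_mask); /* like cpumask_first() */
    }

    int main(void)
    {
            idle_cpus_mask |= 1ULL << 5;    /* cpu 5 went tickless-idle */
            idle_cpus_mask |= 1ULL << 9;

            int ilb = first_idle_cpu();
            if (ilb < NR_CPUS)
                    printf("kick idle load balancer on cpu %d\n", ilb);
            return 0;
    }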
4503 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 4830 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
4504 | static inline int find_new_ilb(int call_cpu) | 4831 | static inline int find_new_ilb(int call_cpu) |
@@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu) | |||
4518 | 4845 | ||
4519 | nohz.next_balance++; | 4846 | nohz.next_balance++; |
4520 | 4847 | ||
4521 | ilb_cpu = get_nohz_load_balancer(); | 4848 | ilb_cpu = find_new_ilb(cpu); |
4522 | |||
4523 | if (ilb_cpu >= nr_cpu_ids) { | ||
4524 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
4525 | if (ilb_cpu >= nr_cpu_ids) | ||
4526 | return; | ||
4527 | } | ||
4528 | 4849 | ||
4529 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4850 | if (ilb_cpu >= nr_cpu_ids) |
4530 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4851 | return; |
4531 | 4852 | ||
4532 | smp_mb(); | 4853 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) |
4533 | /* | 4854 | return; |
4534 | * Use smp_send_reschedule() instead of resched_cpu(). | 4855 | /* |
4535 | * This way we generate a sched IPI on the target cpu which | 4856 | * Use smp_send_reschedule() instead of resched_cpu(). |
4536 | * is idle. And the softirq performing nohz idle load balance | 4857 | * This way we generate a sched IPI on the target cpu which |
4537 | * will be run before returning from the IPI. | 4858 | * is idle. And the softirq performing nohz idle load balance |
4538 | */ | 4859 | * will be run before returning from the IPI. |
4539 | smp_send_reschedule(ilb_cpu); | 4860 | */ |
4540 | } | 4861 | smp_send_reschedule(ilb_cpu); |
4541 | return; | 4862 | return; |
4542 | } | 4863 | } |
4543 | 4864 | ||
4544 | /* | 4865 | static inline void set_cpu_sd_state_busy(void) |
4545 | * This routine will try to nominate the ilb (idle load balancing) | ||
4546 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
4547 | * load balancing on behalf of all those cpus. | ||
4548 | * | ||
4549 | * When the ilb owner becomes busy, we will not have new ilb owner until some | ||
4550 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick | ||
4551 | * idle load balancing by kicking one of the idle CPUs. | ||
4552 | * | ||
4553 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this | ||
4554 | * ilb owner CPU in future (when there is a need for idle load balancing on | ||
4555 | * behalf of all idle CPUs). | ||
4556 | */ | ||
4557 | void select_nohz_load_balancer(int stop_tick) | ||
4558 | { | 4866 | { |
4867 | struct sched_domain *sd; | ||
4559 | int cpu = smp_processor_id(); | 4868 | int cpu = smp_processor_id(); |
4560 | 4869 | ||
4561 | if (stop_tick) { | 4870 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
4562 | if (!cpu_active(cpu)) { | 4871 | return; |
4563 | if (atomic_read(&nohz.load_balancer) != cpu) | 4872 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); |
4564 | return; | ||
4565 | |||
4566 | /* | ||
4567 | * If we are going offline and still the leader, | ||
4568 | * give up! | ||
4569 | */ | ||
4570 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
4571 | nr_cpu_ids) != cpu) | ||
4572 | BUG(); | ||
4573 | 4873 | ||
4574 | return; | 4874 | rcu_read_lock(); |
4575 | } | 4875 | for_each_domain(cpu, sd) |
4876 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
4877 | rcu_read_unlock(); | ||
4878 | } | ||
4576 | 4879 | ||
4577 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4880 | void set_cpu_sd_state_idle(void) |
4881 | { | ||
4882 | struct sched_domain *sd; | ||
4883 | int cpu = smp_processor_id(); | ||
4578 | 4884 | ||
4579 | if (atomic_read(&nohz.first_pick_cpu) == cpu) | 4885 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
4580 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); | 4886 | return; |
4581 | if (atomic_read(&nohz.second_pick_cpu) == cpu) | 4887 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); |
4582 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
4583 | 4888 | ||
4584 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { | 4889 | rcu_read_lock(); |
4585 | int new_ilb; | 4890 | for_each_domain(cpu, sd) |
4891 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
4892 | rcu_read_unlock(); | ||
4893 | } | ||
4586 | 4894 | ||
4587 | /* make me the ilb owner */ | 4895 | /* |
4588 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, | 4896 | * This routine will record that this cpu is going idle with tick stopped. |
4589 | cpu) != nr_cpu_ids) | 4897 | * This info will be used in performing idle load balancing in the future. |
4590 | return; | 4898 | */ |
4899 | void select_nohz_load_balancer(int stop_tick) | ||
4900 | { | ||
4901 | int cpu = smp_processor_id(); | ||
4591 | 4902 | ||
4592 | /* | 4903 | if (stop_tick) { |
4593 | * Check to see if there is a more power-efficient | 4904 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
4594 | * ilb. | ||
4595 | */ | ||
4596 | new_ilb = find_new_ilb(cpu); | ||
4597 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4598 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
4599 | resched_cpu(new_ilb); | ||
4600 | return; | ||
4601 | } | ||
4602 | return; | ||
4603 | } | ||
4604 | } else { | ||
4605 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
4606 | return; | 4905 | return; |
4607 | 4906 | ||
4608 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4907 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
4609 | 4908 | atomic_inc(&nohz.nr_cpus); | |
4610 | if (atomic_read(&nohz.load_balancer) == cpu) | 4909 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
4611 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
4612 | nr_cpu_ids) != cpu) | ||
4613 | BUG(); | ||
4614 | } | 4910 | } |
4615 | return; | 4911 | return; |
4616 | } | 4912 | } |
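The per-cpu nohz state is now a set of bits (NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, NOHZ_IDLE) tested through nohz_flags(cpu), with test_and_set_bit() ensuring that only one kick IPI is sent per idle period. A standalone sketch of that idea; the bit ordering and the nohz_flags_cpu array are invented stand-ins, and the real atomic bitops are replaced by plain C helpers:

    #include <stdio.h>

    /* illustrative bit numbers; the real ones live in kernel/sched/sched.h,
     * which is not part of this hunk */
    enum { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, NOHZ_IDLE };

    static unsigned long nohz_flags_cpu[4]; /* stand-in for per-cpu rq->nohz_flags */

    static int test_and_set_flag(unsigned long *w, int bit)
    {
            int old = (*w >> bit) & 1;
            *w |= 1UL << bit;
            return old;
    }

    int main(void)
    {
            int cpu = 2;

            /* tick stops: record it once */
            if (!test_and_set_flag(&nohz_flags_cpu[cpu], NOHZ_TICK_STOPPED))
                    printf("cpu %d recorded as tickless idle\n", cpu);

            /* a busy cpu kicks it: only the first kick sends an IPI */
            if (!test_and_set_flag(&nohz_flags_cpu[cpu], NOHZ_BALANCE_KICK))
                    printf("send reschedule IPI to cpu %d\n", cpu);
            if (test_and_set_flag(&nohz_flags_cpu[cpu], NOHZ_BALANCE_KICK))
                    printf("kick already pending, skip IPI\n");
            return 0;
    }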
@@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; | |||
4624 | * Scale the max load_balance interval with the number of CPUs in the system. | 4920 | * Scale the max load_balance interval with the number of CPUs in the system. |
4625 | * This trades load-balance latency on larger machines for less cross talk. | 4921 | * This trades load-balance latency on larger machines for less cross talk. |
4626 | */ | 4922 | */ |
4627 | static void update_max_interval(void) | 4923 | void update_max_interval(void) |
4628 | { | 4924 | { |
4629 | max_load_balance_interval = HZ*num_online_cpus()/10; | 4925 | max_load_balance_interval = HZ*num_online_cpus()/10; |
4630 | } | 4926 | } |
@@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4716 | struct rq *rq; | 5012 | struct rq *rq; |
4717 | int balance_cpu; | 5013 | int balance_cpu; |
4718 | 5014 | ||
4719 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | 5015 | if (idle != CPU_IDLE || |
4720 | return; | 5016 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
5017 | goto end; | ||
4721 | 5018 | ||
4722 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 5019 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { |
4723 | if (balance_cpu == this_cpu) | 5020 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) |
4724 | continue; | 5021 | continue; |
4725 | 5022 | ||
4726 | /* | 5023 | /* |
@@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4728 | * work being done for other cpus. Next load | 5025 | * work being done for other cpus. Next load |
4729 | * balancing owner will pick it up. | 5026 | * balancing owner will pick it up. |
4730 | */ | 5027 | */ |
4731 | if (need_resched()) { | 5028 | if (need_resched()) |
4732 | this_rq->nohz_balance_kick = 0; | ||
4733 | break; | 5029 | break; |
4734 | } | ||
4735 | 5030 | ||
4736 | raw_spin_lock_irq(&this_rq->lock); | 5031 | raw_spin_lock_irq(&this_rq->lock); |
4737 | update_rq_clock(this_rq); | 5032 | update_rq_clock(this_rq); |
@@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4745 | this_rq->next_balance = rq->next_balance; | 5040 | this_rq->next_balance = rq->next_balance; |
4746 | } | 5041 | } |
4747 | nohz.next_balance = this_rq->next_balance; | 5042 | nohz.next_balance = this_rq->next_balance; |
4748 | this_rq->nohz_balance_kick = 0; | 5043 | end: |
5044 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
4749 | } | 5045 | } |
4750 | 5046 | ||
4751 | /* | 5047 | /* |
4752 | * Current heuristic for kicking the idle load balancer | 5048 | * Current heuristic for kicking the idle load balancer in the presence |
4753 | * - first_pick_cpu is the one of the busy CPUs. It will kick | 5049 | * of an idle cpu is the system. |
4754 | * idle load balancer when it has more than one process active. This | 5050 | * - This rq has more than one task. |
4755 | * eliminates the need for idle load balancing altogether when we have | 5051 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
4756 | * only one running process in the system (common case). | 5052 | * busy cpu's exceeding the group's power. |
4757 | * - If there are more than one busy CPU, idle load balancer may have | 5053 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
4758 | * to run for active_load_balance to happen (i.e., two busy CPUs are | 5054 | * domain span are idle. |
4759 | * SMT or core siblings and can run better if they move to different | ||
4760 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
4761 | * which will kick idle load balancer as soon as it has any load. | ||
4762 | */ | 5055 | */ |
4763 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 5056 | static inline int nohz_kick_needed(struct rq *rq, int cpu) |
4764 | { | 5057 | { |
4765 | unsigned long now = jiffies; | 5058 | unsigned long now = jiffies; |
4766 | int ret; | 5059 | struct sched_domain *sd; |
4767 | int first_pick_cpu, second_pick_cpu; | ||
4768 | 5060 | ||
4769 | if (time_before(now, nohz.next_balance)) | 5061 | if (unlikely(idle_cpu(cpu))) |
4770 | return 0; | 5062 | return 0; |
4771 | 5063 | ||
4772 | if (idle_cpu(cpu)) | 5064 | /* |
4773 | return 0; | 5065 | * We may be recently in ticked or tickless idle mode. At the first |
5066 | * busy tick after returning from idle, we will update the busy stats. | ||
5067 | */ | ||
5068 | set_cpu_sd_state_busy(); | ||
5069 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
5070 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
5071 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
5072 | atomic_dec(&nohz.nr_cpus); | ||
5073 | } | ||
4774 | 5074 | ||
4775 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 5075 | /* |
4776 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | 5076 | * None are in tickless mode and hence no need for NOHZ idle load |
5077 | * balancing. | ||
5078 | */ | ||
5079 | if (likely(!atomic_read(&nohz.nr_cpus))) | ||
5080 | return 0; | ||
4777 | 5081 | ||
4778 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | 5082 | if (time_before(now, nohz.next_balance)) |
4779 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
4780 | return 0; | 5083 | return 0; |
4781 | 5084 | ||
4782 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | 5085 | if (rq->nr_running >= 2) |
4783 | if (ret == nr_cpu_ids || ret == cpu) { | 5086 | goto need_kick; |
4784 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | 5087 | |
4785 | if (rq->nr_running > 1) | 5088 | rcu_read_lock(); |
4786 | return 1; | 5089 | for_each_domain(cpu, sd) { |
4787 | } else { | 5090 | struct sched_group *sg = sd->groups; |
4788 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | 5091 | struct sched_group_power *sgp = sg->sgp; |
4789 | if (ret == nr_cpu_ids || ret == cpu) { | 5092 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); |
4790 | if (rq->nr_running) | 5093 | |
4791 | return 1; | 5094 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) |
4792 | } | 5095 | goto need_kick_unlock; |
5096 | |||
5097 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | ||
5098 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
5099 | sched_domain_span(sd)) < cpu)) | ||
5100 | goto need_kick_unlock; | ||
5101 | |||
5102 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
5103 | break; | ||
4793 | } | 5104 | } |
5105 | rcu_read_unlock(); | ||
4794 | return 0; | 5106 | return 0; |
5107 | |||
5108 | need_kick_unlock: | ||
5109 | rcu_read_unlock(); | ||
5110 | need_kick: | ||
5111 | return 1; | ||
4795 | } | 5112 | } |
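Condensed, the new nohz_kick_needed() above says: do nothing if no CPU is tickless-idle, kick if this rq has two or more runnable tasks, otherwise kick only if the domain scan finds a busy sibling sharing package resources (or an SD_ASYM_PACKING violation). A reduced sketch with the time check and state updates omitted and the domain scan collapsed into one precomputed boolean:

    #include <stdio.h>
    #include <stdbool.h>

    static bool nohz_kick_needed_sketch(int nr_running, int nr_idle_tickless,
                                        bool shared_cache_has_busy_sibling)
    {
            if (!nr_idle_tickless)          /* nobody available to balance for us */
                    return false;
            if (nr_running >= 2)            /* this cpu clearly has surplus work */
                    return true;
            return shared_cache_has_busy_sibling;
    }

    int main(void)
    {
            printf("%d\n", nohz_kick_needed_sketch(3, 2, false));  /* 1 */
            printf("%d\n", nohz_kick_needed_sketch(1, 2, false));  /* 0 */
            return 0;
    }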
4796 | #else | 5113 | #else |
4797 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 5114 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } |
@@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu) | |||
4826 | /* | 5143 | /* |
4827 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 5144 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
4828 | */ | 5145 | */ |
4829 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 5146 | void trigger_load_balance(struct rq *rq, int cpu) |
4830 | { | 5147 | { |
4831 | /* Don't need to rebalance while attached to NULL domain */ | 5148 | /* Don't need to rebalance while attached to NULL domain */ |
4832 | if (time_after_eq(jiffies, rq->next_balance) && | 5149 | if (time_after_eq(jiffies, rq->next_balance) && |
4833 | likely(!on_null_domain(cpu))) | 5150 | likely(!on_null_domain(cpu))) |
4834 | raise_softirq(SCHED_SOFTIRQ); | 5151 | raise_softirq(SCHED_SOFTIRQ); |
4835 | #ifdef CONFIG_NO_HZ | 5152 | #ifdef CONFIG_NO_HZ |
4836 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5153 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
4837 | nohz_balancer_kick(cpu); | 5154 | nohz_balancer_kick(cpu); |
4838 | #endif | 5155 | #endif |
4839 | } | 5156 | } |
@@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq) | |||
4848 | update_sysctl(); | 5165 | update_sysctl(); |
4849 | } | 5166 | } |
4850 | 5167 | ||
4851 | #else /* CONFIG_SMP */ | ||
4852 | |||
4853 | /* | ||
4854 | * on UP we do not need to balance between CPUs: | ||
4855 | */ | ||
4856 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4857 | { | ||
4858 | } | ||
4859 | |||
4860 | #endif /* CONFIG_SMP */ | 5168 | #endif /* CONFIG_SMP */ |
4861 | 5169 | ||
4862 | /* | 5170 | /* |
@@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4880 | */ | 5188 | */ |
4881 | static void task_fork_fair(struct task_struct *p) | 5189 | static void task_fork_fair(struct task_struct *p) |
4882 | { | 5190 | { |
4883 | struct cfs_rq *cfs_rq = task_cfs_rq(current); | 5191 | struct cfs_rq *cfs_rq; |
4884 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; | 5192 | struct sched_entity *se = &p->se, *curr; |
4885 | int this_cpu = smp_processor_id(); | 5193 | int this_cpu = smp_processor_id(); |
4886 | struct rq *rq = this_rq(); | 5194 | struct rq *rq = this_rq(); |
4887 | unsigned long flags; | 5195 | unsigned long flags; |
@@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p) | |||
4890 | 5198 | ||
4891 | update_rq_clock(rq); | 5199 | update_rq_clock(rq); |
4892 | 5200 | ||
5201 | cfs_rq = task_cfs_rq(current); | ||
5202 | curr = cfs_rq->curr; | ||
5203 | |||
4893 | if (unlikely(task_cpu(p) != this_cpu)) { | 5204 | if (unlikely(task_cpu(p) != this_cpu)) { |
4894 | rcu_read_lock(); | 5205 | rcu_read_lock(); |
4895 | __set_task_cpu(p, this_cpu); | 5206 | __set_task_cpu(p, this_cpu); |
@@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq) | |||
4999 | } | 5310 | } |
5000 | } | 5311 | } |
5001 | 5312 | ||
5313 | void init_cfs_rq(struct cfs_rq *cfs_rq) | ||
5314 | { | ||
5315 | cfs_rq->tasks_timeline = RB_ROOT; | ||
5316 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
5317 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
5318 | #ifndef CONFIG_64BIT | ||
5319 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
5320 | #endif | ||
5321 | } | ||
5322 | |||
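init_cfs_rq() seeds min_vruntime roughly a millisecond's worth of vruntime (2^20 ns) below the 64-bit wrap point; the usual reading, which is an inference rather than something stated in this hunk, is that a comparison that is not wrap-safe then misbehaves almost immediately instead of only after weeks of uptime. A small program showing the seed value and why signed-difference comparisons survive the wrap:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* same expression as init_cfs_rq() above: start just below the
             * u64 wrap point so overflow bugs surface early */
            uint64_t min_vruntime = (uint64_t)(-(1LL << 20));

            printf("initial min_vruntime = %llu (0x%llx)\n",
                   (unsigned long long)min_vruntime,
                   (unsigned long long)min_vruntime);

            /* vruntime ordering is decided on the signed difference, so it
             * stays correct across the wrap */
            uint64_t a = min_vruntime + 100;
            uint64_t b = min_vruntime + 2000000;    /* already wrapped */
            printf("a before b: %d\n", (int64_t)(a - b) < 0);
            return 0;
    }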
5002 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5323 | #ifdef CONFIG_FAIR_GROUP_SCHED |
5003 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5324 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
5004 | { | 5325 | { |
@@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
5015 | * to another cgroup's rq. This does somewhat interfere with the | 5336 | * to another cgroup's rq. This does somewhat interfere with the |
5016 | * fair sleeper stuff for the first placement, but who cares. | 5337 | * fair sleeper stuff for the first placement, but who cares. |
5017 | */ | 5338 | */ |
5339 | /* | ||
5340 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
5341 | * But there are some cases where it has already been normalized: | ||
5342 | * | ||
5343 | * - Moving a forked child which is waiting to be woken up by | ||
5344 | * wake_up_new_task(). | ||
5345 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
5346 | * still waiting to actually be woken up by sched_ttwu_pending(). | ||
5347 | * | ||
5348 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
5349 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
5350 | */ | ||
5351 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | ||
5352 | on_rq = 1; | ||
5353 | |||
5018 | if (!on_rq) | 5354 | if (!on_rq) |
5019 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5355 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
5020 | set_task_rq(p, task_cpu(p)); | 5356 | set_task_rq(p, task_cpu(p)); |
5021 | if (!on_rq) | 5357 | if (!on_rq) |
5022 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5358 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
5023 | } | 5359 | } |
5360 | |||
5361 | void free_fair_sched_group(struct task_group *tg) | ||
5362 | { | ||
5363 | int i; | ||
5364 | |||
5365 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5366 | |||
5367 | for_each_possible_cpu(i) { | ||
5368 | if (tg->cfs_rq) | ||
5369 | kfree(tg->cfs_rq[i]); | ||
5370 | if (tg->se) | ||
5371 | kfree(tg->se[i]); | ||
5372 | } | ||
5373 | |||
5374 | kfree(tg->cfs_rq); | ||
5375 | kfree(tg->se); | ||
5376 | } | ||
5377 | |||
5378 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5379 | { | ||
5380 | struct cfs_rq *cfs_rq; | ||
5381 | struct sched_entity *se; | ||
5382 | int i; | ||
5383 | |||
5384 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
5385 | if (!tg->cfs_rq) | ||
5386 | goto err; | ||
5387 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
5388 | if (!tg->se) | ||
5389 | goto err; | ||
5390 | |||
5391 | tg->shares = NICE_0_LOAD; | ||
5392 | |||
5393 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5394 | |||
5395 | for_each_possible_cpu(i) { | ||
5396 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
5397 | GFP_KERNEL, cpu_to_node(i)); | ||
5398 | if (!cfs_rq) | ||
5399 | goto err; | ||
5400 | |||
5401 | se = kzalloc_node(sizeof(struct sched_entity), | ||
5402 | GFP_KERNEL, cpu_to_node(i)); | ||
5403 | if (!se) | ||
5404 | goto err_free_rq; | ||
5405 | |||
5406 | init_cfs_rq(cfs_rq); | ||
5407 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
5408 | } | ||
5409 | |||
5410 | return 1; | ||
5411 | |||
5412 | err_free_rq: | ||
5413 | kfree(cfs_rq); | ||
5414 | err: | ||
5415 | return 0; | ||
5416 | } | ||
5417 | |||
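alloc_fair_sched_group() above follows the usual two-level pattern: allocate the per-cpu pointer arrays, then one cfs_rq and one sched_entity per possible CPU, unwinding through goto labels on failure (the kernel version frees only the last partial allocation at err_free_rq and leaves the rest to free_fair_sched_group()). A self-contained user-space analogue with invented types; it unwinds fully so it can stand alone:

    #include <stdlib.h>
    #include <stdio.h>

    struct fake_cfs_rq { int cpu; };

    static int nr_cpus = 4;

    static int alloc_group(struct fake_cfs_rq ***out)
    {
            struct fake_cfs_rq **rqs = calloc(nr_cpus, sizeof(*rqs));
            int i;

            if (!rqs)
                    goto err;

            for (i = 0; i < nr_cpus; i++) {
                    rqs[i] = calloc(1, sizeof(**rqs));
                    if (!rqs[i])
                            goto err_free;
                    rqs[i]->cpu = i;
            }
            *out = rqs;
            return 1;

    err_free:
            while (i--)
                    free(rqs[i]);
            free(rqs);
    err:
            return 0;
    }

    int main(void)
    {
            struct fake_cfs_rq **rqs;

            if (alloc_group(&rqs))
                    printf("allocated %d per-cpu structures\n", nr_cpus);
            return 0;
    }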
5418 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
5419 | { | ||
5420 | struct rq *rq = cpu_rq(cpu); | ||
5421 | unsigned long flags; | ||
5422 | |||
5423 | /* | ||
5424 | * Only empty task groups can be destroyed; so we can speculatively | ||
5425 | * check on_list without danger of it being re-added. | ||
5426 | */ | ||
5427 | if (!tg->cfs_rq[cpu]->on_list) | ||
5428 | return; | ||
5429 | |||
5430 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5431 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
5432 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5433 | } | ||
5434 | |||
5435 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
5436 | struct sched_entity *se, int cpu, | ||
5437 | struct sched_entity *parent) | ||
5438 | { | ||
5439 | struct rq *rq = cpu_rq(cpu); | ||
5440 | |||
5441 | cfs_rq->tg = tg; | ||
5442 | cfs_rq->rq = rq; | ||
5443 | #ifdef CONFIG_SMP | ||
5444 | /* allow initial update_cfs_load() to truncate */ | ||
5445 | cfs_rq->load_stamp = 1; | ||
5024 | #endif | 5446 | #endif |
5447 | init_cfs_rq_runtime(cfs_rq); | ||
5448 | |||
5449 | tg->cfs_rq[cpu] = cfs_rq; | ||
5450 | tg->se[cpu] = se; | ||
5451 | |||
5452 | /* se could be NULL for root_task_group */ | ||
5453 | if (!se) | ||
5454 | return; | ||
5455 | |||
5456 | if (!parent) | ||
5457 | se->cfs_rq = &rq->cfs; | ||
5458 | else | ||
5459 | se->cfs_rq = parent->my_q; | ||
5460 | |||
5461 | se->my_q = cfs_rq; | ||
5462 | update_load_set(&se->load, 0); | ||
5463 | se->parent = parent; | ||
5464 | } | ||
5465 | |||
5466 | static DEFINE_MUTEX(shares_mutex); | ||
5467 | |||
5468 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
5469 | { | ||
5470 | int i; | ||
5471 | unsigned long flags; | ||
5472 | |||
5473 | /* | ||
5474 | * We can't change the weight of the root cgroup. | ||
5475 | */ | ||
5476 | if (!tg->se[0]) | ||
5477 | return -EINVAL; | ||
5478 | |||
5479 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
5480 | |||
5481 | mutex_lock(&shares_mutex); | ||
5482 | if (tg->shares == shares) | ||
5483 | goto done; | ||
5484 | |||
5485 | tg->shares = shares; | ||
5486 | for_each_possible_cpu(i) { | ||
5487 | struct rq *rq = cpu_rq(i); | ||
5488 | struct sched_entity *se; | ||
5489 | |||
5490 | se = tg->se[i]; | ||
5491 | /* Propagate contribution to hierarchy */ | ||
5492 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5493 | for_each_sched_entity(se) | ||
5494 | update_cfs_shares(group_cfs_rq(se)); | ||
5495 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5496 | } | ||
5497 | |||
5498 | done: | ||
5499 | mutex_unlock(&shares_mutex); | ||
5500 | return 0; | ||
5501 | } | ||
5502 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
5503 | |||
5504 | void free_fair_sched_group(struct task_group *tg) { } | ||
5505 | |||
5506 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5507 | { | ||
5508 | return 1; | ||
5509 | } | ||
5510 | |||
5511 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | ||
5512 | |||
5513 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
5514 | |||
5025 | 5515 | ||
5026 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 5516 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
5027 | { | 5517 | { |
@@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | |||
5041 | /* | 5531 | /* |
5042 | * All the scheduling class methods: | 5532 | * All the scheduling class methods: |
5043 | */ | 5533 | */ |
5044 | static const struct sched_class fair_sched_class = { | 5534 | const struct sched_class fair_sched_class = { |
5045 | .next = &idle_sched_class, | 5535 | .next = &idle_sched_class, |
5046 | .enqueue_task = enqueue_task_fair, | 5536 | .enqueue_task = enqueue_task_fair, |
5047 | .dequeue_task = dequeue_task_fair, | 5537 | .dequeue_task = dequeue_task_fair, |
@@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = { | |||
5078 | }; | 5568 | }; |
5079 | 5569 | ||
5080 | #ifdef CONFIG_SCHED_DEBUG | 5570 | #ifdef CONFIG_SCHED_DEBUG |
5081 | static void print_cfs_stats(struct seq_file *m, int cpu) | 5571 | void print_cfs_stats(struct seq_file *m, int cpu) |
5082 | { | 5572 | { |
5083 | struct cfs_rq *cfs_rq; | 5573 | struct cfs_rq *cfs_rq; |
5084 | 5574 | ||
@@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
5088 | rcu_read_unlock(); | 5578 | rcu_read_unlock(); |
5089 | } | 5579 | } |
5090 | #endif | 5580 | #endif |
5581 | |||
5582 | __init void init_sched_fair_class(void) | ||
5583 | { | ||
5584 | #ifdef CONFIG_SMP | ||
5585 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
5586 | |||
5587 | #ifdef CONFIG_NO_HZ | ||
5588 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
5589 | #endif | ||
5590 | #endif /* SMP */ | ||
5591 | |||
5592 | } | ||
diff --git a/kernel/sched_features.h b/kernel/sched/features.h index 84802245abd2..e61fd73913d0 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h | |||
@@ -3,13 +3,13 @@ | |||
3 | * them to run sooner, but does not allow tons of sleepers to | 3 | * them to run sooner, but does not allow tons of sleepers to |
4 | * rip the spread apart. | 4 | * rip the spread apart. |
5 | */ | 5 | */ |
6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | 6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * Place new tasks ahead so that they do not starve already running | 9 | * Place new tasks ahead so that they do not starve already running |
10 | * tasks | 10 | * tasks |
11 | */ | 11 | */ |
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, true) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1) | |||
17 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. |
19 | */ | 19 | */ |
20 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * Prefer to schedule the task we woke last (assuming it failed | 23 | * Prefer to schedule the task we woke last (assuming it failed |
24 | * wakeup-preemption), since it's likely going to consume data we | 24 | * wakeup-preemption), since it's likely going to consume data we |
25 | * touched, increases cache locality. | 25 | * touched, increases cache locality. |
26 | */ | 26 | */ |
27 | SCHED_FEAT(NEXT_BUDDY, 0) | 27 | SCHED_FEAT(NEXT_BUDDY, false) |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * Prefer to schedule the task that ran last (when we did | 30 | * Prefer to schedule the task that ran last (when we did |
31 | * wake-preempt) as that likely will touch the same data, increases | 31 | * wake-preempt) as that likely will touch the same data, increases |
32 | * cache locality. | 32 | * cache locality. |
33 | */ | 33 | */ |
34 | SCHED_FEAT(LAST_BUDDY, 1) | 34 | SCHED_FEAT(LAST_BUDDY, true) |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * Consider buddies to be cache hot, decreases the likelihood of a | 37 | * Consider buddies to be cache hot, decreases the likelihood of a |
38 | * cache buddy being migrated away, increases cache locality. | 38 | * cache buddy being migrated away, increases cache locality. |
39 | */ | 39 | */ |
40 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | 40 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
41 | 41 | ||
42 | /* | 42 | /* |
43 | * Use arch dependent cpu power functions | 43 | * Use arch dependent cpu power functions |
44 | */ | 44 | */ |
45 | SCHED_FEAT(ARCH_POWER, 0) | 45 | SCHED_FEAT(ARCH_POWER, false) |
46 | 46 | ||
47 | SCHED_FEAT(HRTICK, 0) | 47 | SCHED_FEAT(HRTICK, false) |
48 | SCHED_FEAT(DOUBLE_TICK, 0) | 48 | SCHED_FEAT(DOUBLE_TICK, false) |
49 | SCHED_FEAT(LB_BIAS, 1) | 49 | SCHED_FEAT(LB_BIAS, true) |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Spin-wait on mutex acquisition when the mutex owner is running on | 52 | * Spin-wait on mutex acquisition when the mutex owner is running on |
53 | * another cpu -- assumes that when the owner is running, it will soon | 53 | * another cpu -- assumes that when the owner is running, it will soon |
54 | * release the lock. Decreases scheduling overhead. | 54 | * release the lock. Decreases scheduling overhead. |
55 | */ | 55 | */ |
56 | SCHED_FEAT(OWNER_SPIN, 1) | 56 | SCHED_FEAT(OWNER_SPIN, true) |
57 | 57 | ||
58 | /* | 58 | /* |
59 | * Decrement CPU power based on time not spent running tasks | 59 | * Decrement CPU power based on time not spent running tasks |
60 | */ | 60 | */ |
61 | SCHED_FEAT(NONTASK_POWER, 1) | 61 | SCHED_FEAT(NONTASK_POWER, true) |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Queue remote wakeups on the target CPU and process them | 64 | * Queue remote wakeups on the target CPU and process them |
65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | 65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. |
66 | */ | 66 | */ |
67 | SCHED_FEAT(TTWU_QUEUE, 1) | 67 | SCHED_FEAT(TTWU_QUEUE, true) |
68 | 68 | ||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, 1) | 70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
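With the switch to true/false, each SCHED_FEAT(name, enabled) line remains a single x-macro entry that other files expand into an enum of feature indices and a default feature bitmask; those consumers live in the new kernel/sched/sched.h and core.c, not in this hunk, so the expansion below is only an illustrative reconstruction with a shortened feature list:

    #include <stdio.h>
    #include <stdbool.h>

    /* shortened, illustrative feature list in the same shape as above */
    #define SCHED_FEATURES(X)               \
            X(GENTLE_FAIR_SLEEPERS, true)   \
            X(START_DEBIT, true)            \
            X(NEXT_BUDDY, false)            \
            X(HRTICK, false)

    enum {
    #define F(name, enabled) __SCHED_FEAT_##name,
            SCHED_FEATURES(F)
    #undef F
            __SCHED_FEAT_NR,
    };

    /* default bitmask built from the 'enabled' column */
    static const unsigned int sysctl_sched_features =
    #define F(name, enabled) (enabled << __SCHED_FEAT_##name) |
            SCHED_FEATURES(F)
    #undef F
            0;

    #define sched_feat(x) (sysctl_sched_features & (1U << __SCHED_FEAT_##x))

    int main(void)
    {
            printf("START_DEBIT=%u NEXT_BUDDY=%u\n",
                   !!sched_feat(START_DEBIT), !!sched_feat(NEXT_BUDDY));
            return 0;
    }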
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534ea..91b4c957f289 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
3 | * | 5 | * |
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task | |||
71 | /* | 73 | /* |
72 | * Simple, special scheduling class for the per-CPU idle tasks: | 74 | * Simple, special scheduling class for the per-CPU idle tasks: |
73 | */ | 75 | */ |
74 | static const struct sched_class idle_sched_class = { | 76 | const struct sched_class idle_sched_class = { |
75 | /* .next is NULL */ | 77 | /* .next is NULL */ |
76 | /* no enqueue/yield_task for idle tasks */ | 78 | /* no enqueue/yield_task for idle tasks */ |
77 | 79 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index 583a1368afe6..3640ebbb466b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,7 +3,92 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include "sched.h" | ||
7 | |||
8 | #include <linux/slab.h> | ||
9 | |||
10 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
11 | |||
12 | struct rt_bandwidth def_rt_bandwidth; | ||
13 | |||
14 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
15 | { | ||
16 | struct rt_bandwidth *rt_b = | ||
17 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
18 | ktime_t now; | ||
19 | int overrun; | ||
20 | int idle = 0; | ||
21 | |||
22 | for (;;) { | ||
23 | now = hrtimer_cb_get_time(timer); | ||
24 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
25 | |||
26 | if (!overrun) | ||
27 | break; | ||
28 | |||
29 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
30 | } | ||
31 | |||
32 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
33 | } | ||
34 | |||
35 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
36 | { | ||
37 | rt_b->rt_period = ns_to_ktime(period); | ||
38 | rt_b->rt_runtime = runtime; | ||
39 | |||
40 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
41 | |||
42 | hrtimer_init(&rt_b->rt_period_timer, | ||
43 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
44 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
45 | } | ||
46 | |||
47 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
48 | { | ||
49 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
50 | return; | ||
51 | |||
52 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
53 | return; | ||
54 | |||
55 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
56 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
57 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
58 | } | ||
59 | |||
60 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
61 | { | ||
62 | struct rt_prio_array *array; | ||
63 | int i; | ||
64 | |||
65 | array = &rt_rq->active; | ||
66 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
67 | INIT_LIST_HEAD(array->queue + i); | ||
68 | __clear_bit(i, array->bitmap); | ||
69 | } | ||
70 | /* delimiter for bitsearch: */ | ||
71 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
72 | |||
73 | #if defined CONFIG_SMP | ||
74 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
75 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
76 | rt_rq->rt_nr_migratory = 0; | ||
77 | rt_rq->overloaded = 0; | ||
78 | plist_head_init(&rt_rq->pushable_tasks); | ||
79 | #endif | ||
80 | |||
81 | rt_rq->rt_time = 0; | ||
82 | rt_rq->rt_throttled = 0; | ||
83 | rt_rq->rt_runtime = 0; | ||
84 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
85 | } | ||
86 | |||
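init_rt_rq() sets bit MAX_RT_PRIO as a "delimiter for bitsearch": with one bit reserved past the real priority range, a find-first-set over the active bitmap always terminates at MAX_RT_PRIO when the runqueue is empty instead of needing a separate emptiness check. A standalone sketch with hand-rolled byte-wise bit helpers standing in for the kernel bitops:

    #include <stdio.h>
    #include <string.h>

    #define MAX_RT_PRIO 100

    /* one bit per priority plus the delimiter bit */
    static unsigned char bitmap[(MAX_RT_PRIO + 1 + 7) / 8];

    static void set_bit_(int n)   { bitmap[n / 8] |= (unsigned char)(1u << (n % 8)); }
    static void clear_bit_(int n) { bitmap[n / 8] &= (unsigned char)~(1u << (n % 8)); }

    static int find_first_bit_(void)
    {
            for (int i = 0; i <= MAX_RT_PRIO; i++)
                    if (bitmap[i / 8] & (1u << (i % 8)))
                            return i;
            return MAX_RT_PRIO + 1;     /* unreachable once the delimiter is set */
    }

    int main(void)
    {
            memset(bitmap, 0, sizeof(bitmap));

            /* delimiter for bitsearch, as in init_rt_rq() above */
            set_bit_(MAX_RT_PRIO);
            printf("first set bit (empty rq): %d\n", find_first_bit_()); /* 100 */

            set_bit_(5);                            /* a prio-5 task is queued */
            printf("first set bit: %d\n", find_first_bit_());            /* 5 */

            clear_bit_(5);                          /* dequeued again */
            printf("first set bit: %d\n", find_first_bit_());            /* 100 */
            return 0;
    }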
6 | #ifdef CONFIG_RT_GROUP_SCHED | 87 | #ifdef CONFIG_RT_GROUP_SCHED |
88 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
89 | { | ||
90 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
91 | } | ||
7 | 92 | ||
8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | 93 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) |
9 | 94 | ||
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
25 | return rt_se->rt_rq; | 110 | return rt_se->rt_rq; |
26 | } | 111 | } |
27 | 112 | ||
113 | void free_rt_sched_group(struct task_group *tg) | ||
114 | { | ||
115 | int i; | ||
116 | |||
117 | if (tg->rt_se) | ||
118 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
119 | |||
120 | for_each_possible_cpu(i) { | ||
121 | if (tg->rt_rq) | ||
122 | kfree(tg->rt_rq[i]); | ||
123 | if (tg->rt_se) | ||
124 | kfree(tg->rt_se[i]); | ||
125 | } | ||
126 | |||
127 | kfree(tg->rt_rq); | ||
128 | kfree(tg->rt_se); | ||
129 | } | ||
130 | |||
131 | void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
132 | struct sched_rt_entity *rt_se, int cpu, | ||
133 | struct sched_rt_entity *parent) | ||
134 | { | ||
135 | struct rq *rq = cpu_rq(cpu); | ||
136 | |||
137 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
138 | rt_rq->rt_nr_boosted = 0; | ||
139 | rt_rq->rq = rq; | ||
140 | rt_rq->tg = tg; | ||
141 | |||
142 | tg->rt_rq[cpu] = rt_rq; | ||
143 | tg->rt_se[cpu] = rt_se; | ||
144 | |||
145 | if (!rt_se) | ||
146 | return; | ||
147 | |||
148 | if (!parent) | ||
149 | rt_se->rt_rq = &rq->rt; | ||
150 | else | ||
151 | rt_se->rt_rq = parent->my_q; | ||
152 | |||
153 | rt_se->my_q = rt_rq; | ||
154 | rt_se->parent = parent; | ||
155 | INIT_LIST_HEAD(&rt_se->run_list); | ||
156 | } | ||
157 | |||
158 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
159 | { | ||
160 | struct rt_rq *rt_rq; | ||
161 | struct sched_rt_entity *rt_se; | ||
162 | int i; | ||
163 | |||
164 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
165 | if (!tg->rt_rq) | ||
166 | goto err; | ||
167 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
168 | if (!tg->rt_se) | ||
169 | goto err; | ||
170 | |||
171 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
172 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
173 | |||
174 | for_each_possible_cpu(i) { | ||
175 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
176 | GFP_KERNEL, cpu_to_node(i)); | ||
177 | if (!rt_rq) | ||
178 | goto err; | ||
179 | |||
180 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
181 | GFP_KERNEL, cpu_to_node(i)); | ||
182 | if (!rt_se) | ||
183 | goto err_free_rq; | ||
184 | |||
185 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
186 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
187 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
188 | } | ||
189 | |||
190 | return 1; | ||
191 | |||
192 | err_free_rq: | ||
193 | kfree(rt_rq); | ||
194 | err: | ||
195 | return 0; | ||
196 | } | ||
197 | |||
28 | #else /* CONFIG_RT_GROUP_SCHED */ | 198 | #else /* CONFIG_RT_GROUP_SCHED */ |
29 | 199 | ||
30 | #define rt_entity_is_task(rt_se) (1) | 200 | #define rt_entity_is_task(rt_se) (1) |
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
47 | return &rq->rt; | 217 | return &rq->rt; |
48 | } | 218 | } |
49 | 219 | ||
220 | void free_rt_sched_group(struct task_group *tg) { } | ||
221 | |||
222 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
223 | { | ||
224 | return 1; | ||
225 | } | ||
50 | #endif /* CONFIG_RT_GROUP_SCHED */ | 226 | #endif /* CONFIG_RT_GROUP_SCHED */ |
51 | 227 | ||
52 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq) | |||
556 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 732 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
557 | } | 733 | } |
558 | 734 | ||
735 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
736 | { | ||
737 | int cpu = (int)(long)hcpu; | ||
738 | |||
739 | switch (action) { | ||
740 | case CPU_DOWN_PREPARE: | ||
741 | case CPU_DOWN_PREPARE_FROZEN: | ||
742 | disable_runtime(cpu_rq(cpu)); | ||
743 | return NOTIFY_OK; | ||
744 | |||
745 | case CPU_DOWN_FAILED: | ||
746 | case CPU_DOWN_FAILED_FROZEN: | ||
747 | case CPU_ONLINE: | ||
748 | case CPU_ONLINE_FROZEN: | ||
749 | enable_runtime(cpu_rq(cpu)); | ||
750 | return NOTIFY_OK; | ||
751 | |||
752 | default: | ||
753 | return NOTIFY_DONE; | ||
754 | } | ||
755 | } | ||
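For context, a sketch of how such a hotplug notifier is typically wired up (illustrative only: the real registration for update_runtime() lives in the scheduler core, outside this hunk, and the names below are made up for the example):

    /* Illustrative only -- not part of this patch. */
    static struct notifier_block update_runtime_nb = {
            .notifier_call  = update_runtime,
    };

    static int __init wire_up_update_runtime(void)
    {
            /* disable_runtime()/enable_runtime() then run on CPU down/up */
            return register_cpu_notifier(&update_runtime_nb);
    }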
756 | |||
559 | static int balance_runtime(struct rt_rq *rt_rq) | 757 | static int balance_runtime(struct rt_rq *rt_rq) |
560 | { | 758 | { |
561 | int more = 0; | 759 | int more = 0; |
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
648 | if (rt_rq->rt_throttled) | 846 | if (rt_rq->rt_throttled) |
649 | return rt_rq_throttled(rt_rq); | 847 | return rt_rq_throttled(rt_rq); |
650 | 848 | ||
651 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 849 | if (runtime >= sched_rt_period(rt_rq)) |
652 | return 0; | 850 | return 0; |
653 | 851 | ||
654 | balance_runtime(rt_rq); | 852 | balance_runtime(rt_rq); |
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
957 | } | 1155 | } |
958 | 1156 | ||
959 | /* | 1157 | /* |
960 | * Put task to the end of the run list without the overhead of dequeue | 1158 | * Put task to the head or the end of the run list without the overhead of |
961 | * followed by enqueue. | 1159 | * dequeue followed by enqueue. |
962 | */ | 1160 | */ |
963 | static void | 1161 | static void |
964 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) | 1162 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) |
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1002 | 1200 | ||
1003 | cpu = task_cpu(p); | 1201 | cpu = task_cpu(p); |
1004 | 1202 | ||
1203 | if (p->rt.nr_cpus_allowed == 1) | ||
1204 | goto out; | ||
1205 | |||
1005 | /* For anything but wake ups, just return the task_cpu */ | 1206 | /* For anything but wake ups, just return the task_cpu */ |
1006 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1207 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
1007 | goto out; | 1208 | goto out; |
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1178 | /* Only try algorithms three times */ | 1379 | /* Only try algorithms three times */ |
1179 | #define RT_MAX_TRIES 3 | 1380 | #define RT_MAX_TRIES 3 |
1180 | 1381 | ||
1181 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | ||
1182 | |||
1183 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1382 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1184 | { | 1383 | { |
1185 | if (!task_running(rq, p) && | 1384 | if (!task_running(rq, p) && |
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1653 | pull_rt_task(rq); | 1852 | pull_rt_task(rq); |
1654 | } | 1853 | } |
1655 | 1854 | ||
1656 | static inline void init_sched_rt_class(void) | 1855 | void init_sched_rt_class(void) |
1657 | { | 1856 | { |
1658 | unsigned int i; | 1857 | unsigned int i; |
1659 | 1858 | ||
1660 | for_each_possible_cpu(i) | 1859 | for_each_possible_cpu(i) { |
1661 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), | 1860 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
1662 | GFP_KERNEL, cpu_to_node(i)); | 1861 | GFP_KERNEL, cpu_to_node(i)); |
1862 | } | ||
1663 | } | 1863 | } |
1664 | #endif /* CONFIG_SMP */ | 1864 | #endif /* CONFIG_SMP */ |
1665 | 1865 | ||
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
1800 | return 0; | 2000 | return 0; |
1801 | } | 2001 | } |
1802 | 2002 | ||
1803 | static const struct sched_class rt_sched_class = { | 2003 | const struct sched_class rt_sched_class = { |
1804 | .next = &fair_sched_class, | 2004 | .next = &fair_sched_class, |
1805 | .enqueue_task = enqueue_task_rt, | 2005 | .enqueue_task = enqueue_task_rt, |
1806 | .dequeue_task = dequeue_task_rt, | 2006 | .dequeue_task = dequeue_task_rt, |
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = { | |||
1835 | #ifdef CONFIG_SCHED_DEBUG | 2035 | #ifdef CONFIG_SCHED_DEBUG |
1836 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | 2036 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); |
1837 | 2037 | ||
1838 | static void print_rt_stats(struct seq_file *m, int cpu) | 2038 | void print_rt_stats(struct seq_file *m, int cpu) |
1839 | { | 2039 | { |
1840 | rt_rq_iter_t iter; | 2040 | rt_rq_iter_t iter; |
1841 | struct rt_rq *rt_rq; | 2041 | struct rt_rq *rt_rq; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000000..98c0c2623db8 --- /dev/null +++ b/kernel/sched/sched.h | |||
@@ -0,0 +1,1166 @@ | |||
1 | |||
2 | #include <linux/sched.h> | ||
3 | #include <linux/mutex.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/stop_machine.h> | ||
6 | |||
7 | #include "cpupri.h" | ||
8 | |||
9 | extern __read_mostly int scheduler_running; | ||
10 | |||
11 | /* | ||
12 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
13 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
14 | * and back. | ||
15 | */ | ||
16 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
17 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
18 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
19 | |||
20 | /* | ||
21 | * 'User priority' is the nice value converted to something we | ||
22 | * can work with better when scaling various scheduler parameters; | ||
23 | * it's a [ 0 ... 39 ] range. | ||
24 | */ | ||
25 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
26 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
27 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
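For orientation, a quick worked illustration of these macros (the concrete numbers assume MAX_RT_PRIO == 100 and MAX_PRIO == 140 from <linux/sched.h>, which are not shown in this hunk):

    NICE_TO_PRIO(0)    /* == 120, the default static_prio            */
    NICE_TO_PRIO(-20)  /* == 100, strongest nice; USER_PRIO() == 0   */
    NICE_TO_PRIO(19)   /* == 139, weakest nice;   USER_PRIO() == 39  */
    MAX_USER_PRIO      /* == 40,  i.e. USER_PRIO(MAX_PRIO)           */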
28 | |||
29 | /* | ||
30 | * Helpers for converting nanosecond timing to jiffy resolution | ||
31 | */ | ||
32 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
33 | |||
34 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
35 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
36 | |||
37 | /* | ||
38 | * These are the 'tuning knobs' of the scheduler: | ||
39 | * | ||
40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
41 | * Timeslices get refilled after they expire. | ||
42 | */ | ||
43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
44 | |||
45 | /* | ||
46 | * A single value that denotes runtime == period, i.e. unlimited time. | ||
47 | */ | ||
48 | #define RUNTIME_INF ((u64)~0ULL) | ||
49 | |||
50 | static inline int rt_policy(int policy) | ||
51 | { | ||
52 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
53 | return 1; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static inline int task_has_rt_policy(struct task_struct *p) | ||
58 | { | ||
59 | return rt_policy(p->policy); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is the priority-queue data structure of the RT scheduling class: | ||
64 | */ | ||
65 | struct rt_prio_array { | ||
66 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
67 | struct list_head queue[MAX_RT_PRIO]; | ||
68 | }; | ||
69 | |||
70 | struct rt_bandwidth { | ||
71 | /* nests inside the rq lock: */ | ||
72 | raw_spinlock_t rt_runtime_lock; | ||
73 | ktime_t rt_period; | ||
74 | u64 rt_runtime; | ||
75 | struct hrtimer rt_period_timer; | ||
76 | }; | ||
77 | |||
78 | extern struct mutex sched_domains_mutex; | ||
79 | |||
80 | #ifdef CONFIG_CGROUP_SCHED | ||
81 | |||
82 | #include <linux/cgroup.h> | ||
83 | |||
84 | struct cfs_rq; | ||
85 | struct rt_rq; | ||
86 | |||
87 | static LIST_HEAD(task_groups); | ||
88 | |||
89 | struct cfs_bandwidth { | ||
90 | #ifdef CONFIG_CFS_BANDWIDTH | ||
91 | raw_spinlock_t lock; | ||
92 | ktime_t period; | ||
93 | u64 quota, runtime; | ||
94 | s64 hierarchal_quota; | ||
95 | u64 runtime_expires; | ||
96 | |||
97 | int idle, timer_active; | ||
98 | struct hrtimer period_timer, slack_timer; | ||
99 | struct list_head throttled_cfs_rq; | ||
100 | |||
101 | /* statistics */ | ||
102 | int nr_periods, nr_throttled; | ||
103 | u64 throttled_time; | ||
104 | #endif | ||
105 | }; | ||
106 | |||
107 | /* task group related information */ | ||
108 | struct task_group { | ||
109 | struct cgroup_subsys_state css; | ||
110 | |||
111 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
112 | /* schedulable entities of this group on each cpu */ | ||
113 | struct sched_entity **se; | ||
114 | /* runqueue "owned" by this group on each cpu */ | ||
115 | struct cfs_rq **cfs_rq; | ||
116 | unsigned long shares; | ||
117 | |||
118 | atomic_t load_weight; | ||
119 | #endif | ||
120 | |||
121 | #ifdef CONFIG_RT_GROUP_SCHED | ||
122 | struct sched_rt_entity **rt_se; | ||
123 | struct rt_rq **rt_rq; | ||
124 | |||
125 | struct rt_bandwidth rt_bandwidth; | ||
126 | #endif | ||
127 | |||
128 | struct rcu_head rcu; | ||
129 | struct list_head list; | ||
130 | |||
131 | struct task_group *parent; | ||
132 | struct list_head siblings; | ||
133 | struct list_head children; | ||
134 | |||
135 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
136 | struct autogroup *autogroup; | ||
137 | #endif | ||
138 | |||
139 | struct cfs_bandwidth cfs_bandwidth; | ||
140 | }; | ||
141 | |||
142 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
143 | #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
144 | |||
145 | /* | ||
146 | * A weight of 0 or 1 can cause arithmetic problems. | ||
147 | * The weight of a cfs_rq is the sum of the weights of the entities | ||
148 | * queued on it, so the weight of an entity should not be too large, | ||
149 | * and neither should the shares value of a task group. | ||
150 | * (The default weight is 1024 - so there's no practical | ||
151 | * limitation from this.) | ||
152 | */ | ||
153 | #define MIN_SHARES (1UL << 1) | ||
154 | #define MAX_SHARES (1UL << 18) | ||
155 | #endif | ||
156 | |||
157 | /* Default task group. | ||
158 | * Every task in the system belongs to this group at bootup. | ||
159 | */ | ||
160 | extern struct task_group root_task_group; | ||
161 | |||
162 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
163 | |||
164 | extern int walk_tg_tree_from(struct task_group *from, | ||
165 | tg_visitor down, tg_visitor up, void *data); | ||
166 | |||
167 | /* | ||
168 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
169 | * leaving it for the final time. | ||
170 | * | ||
171 | * Caller must hold rcu_lock or sufficient equivalent. | ||
172 | */ | ||
173 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
174 | { | ||
175 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
176 | } | ||
177 | |||
178 | extern int tg_nop(struct task_group *tg, void *data); | ||
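A minimal sketch of how a visitor pairs with walk_tg_tree() above (illustrative only: tg_count_one() and count_task_groups() are made up here, and the usual convention that a non-zero return from the down visitor aborts the walk is assumed):

    /* Hypothetical visitor: counts every task group in the hierarchy. */
    static int tg_count_one(struct task_group *tg, void *data)
    {
            (*(int *)data)++;
            return 0;               /* keep walking */
    }

    static int count_task_groups(void)
    {
            int nr_groups = 0;

            rcu_read_lock();        /* required by the comment above */
            walk_tg_tree(tg_count_one, tg_nop, &nr_groups);
            rcu_read_unlock();

            return nr_groups;
    }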
179 | |||
180 | extern void free_fair_sched_group(struct task_group *tg); | ||
181 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | ||
182 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | ||
183 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
184 | struct sched_entity *se, int cpu, | ||
185 | struct sched_entity *parent); | ||
186 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
187 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
188 | |||
189 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | ||
190 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
191 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | ||
192 | |||
193 | extern void free_rt_sched_group(struct task_group *tg); | ||
194 | extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); | ||
195 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
196 | struct sched_rt_entity *rt_se, int cpu, | ||
197 | struct sched_rt_entity *parent); | ||
198 | |||
199 | #else /* CONFIG_CGROUP_SCHED */ | ||
200 | |||
201 | struct cfs_bandwidth { }; | ||
202 | |||
203 | #endif /* CONFIG_CGROUP_SCHED */ | ||
204 | |||
205 | /* CFS-related fields in a runqueue */ | ||
206 | struct cfs_rq { | ||
207 | struct load_weight load; | ||
208 | unsigned long nr_running, h_nr_running; | ||
209 | |||
210 | u64 exec_clock; | ||
211 | u64 min_vruntime; | ||
212 | #ifndef CONFIG_64BIT | ||
213 | u64 min_vruntime_copy; | ||
214 | #endif | ||
215 | |||
216 | struct rb_root tasks_timeline; | ||
217 | struct rb_node *rb_leftmost; | ||
218 | |||
219 | struct list_head tasks; | ||
220 | struct list_head *balance_iterator; | ||
221 | |||
222 | /* | ||
223 | * 'curr' points to the currently running entity on this cfs_rq. | ||
224 | * It is set to NULL otherwise (i.e. when none are currently running). | ||
225 | */ | ||
226 | struct sched_entity *curr, *next, *last, *skip; | ||
227 | |||
228 | #ifdef CONFIG_SCHED_DEBUG | ||
229 | unsigned int nr_spread_over; | ||
230 | #endif | ||
231 | |||
232 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
233 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
234 | |||
235 | /* | ||
236 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
237 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
238 | * (like users, containers etc.) | ||
239 | * | ||
240 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
241 | * list is used during load balance. | ||
242 | */ | ||
243 | int on_list; | ||
244 | struct list_head leaf_cfs_rq_list; | ||
245 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
246 | |||
247 | #ifdef CONFIG_SMP | ||
248 | /* | ||
249 | * the part of load.weight contributed by tasks | ||
250 | */ | ||
251 | unsigned long task_weight; | ||
252 | |||
253 | /* | ||
254 | * h_load = weight * f(tg) | ||
255 | * | ||
256 | * Where f(tg) is the recursive weight fraction assigned to | ||
257 | * this group. | ||
258 | */ | ||
259 | unsigned long h_load; | ||
260 | |||
261 | /* | ||
262 | * Maintaining per-cpu shares distribution for group scheduling | ||
263 | * | ||
264 | * load_stamp is the last time we updated the load average | ||
265 | * load_last is the last time we updated the load average and saw load | ||
266 | * load_unacc_exec_time is currently unaccounted execution time | ||
267 | */ | ||
268 | u64 load_avg; | ||
269 | u64 load_period; | ||
270 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
271 | |||
272 | unsigned long load_contribution; | ||
273 | #endif /* CONFIG_SMP */ | ||
274 | #ifdef CONFIG_CFS_BANDWIDTH | ||
275 | int runtime_enabled; | ||
276 | u64 runtime_expires; | ||
277 | s64 runtime_remaining; | ||
278 | |||
279 | u64 throttled_timestamp; | ||
280 | int throttled, throttle_count; | ||
281 | struct list_head throttled_list; | ||
282 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
283 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
284 | }; | ||
285 | |||
286 | static inline int rt_bandwidth_enabled(void) | ||
287 | { | ||
288 | return sysctl_sched_rt_runtime >= 0; | ||
289 | } | ||
290 | |||
291 | /* Real-Time classes' related field in a runqueue: */ | ||
292 | struct rt_rq { | ||
293 | struct rt_prio_array active; | ||
294 | unsigned long rt_nr_running; | ||
295 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
296 | struct { | ||
297 | int curr; /* highest queued rt task prio */ | ||
298 | #ifdef CONFIG_SMP | ||
299 | int next; /* next highest */ | ||
300 | #endif | ||
301 | } highest_prio; | ||
302 | #endif | ||
303 | #ifdef CONFIG_SMP | ||
304 | unsigned long rt_nr_migratory; | ||
305 | unsigned long rt_nr_total; | ||
306 | int overloaded; | ||
307 | struct plist_head pushable_tasks; | ||
308 | #endif | ||
309 | int rt_throttled; | ||
310 | u64 rt_time; | ||
311 | u64 rt_runtime; | ||
312 | /* Nests inside the rq lock: */ | ||
313 | raw_spinlock_t rt_runtime_lock; | ||
314 | |||
315 | #ifdef CONFIG_RT_GROUP_SCHED | ||
316 | unsigned long rt_nr_boosted; | ||
317 | |||
318 | struct rq *rq; | ||
319 | struct list_head leaf_rt_rq_list; | ||
320 | struct task_group *tg; | ||
321 | #endif | ||
322 | }; | ||
323 | |||
324 | #ifdef CONFIG_SMP | ||
325 | |||
326 | /* | ||
327 | * We add the notion of a root-domain which will be used to define per-domain | ||
328 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
329 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
330 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
331 | * object. | ||
332 | * | ||
333 | */ | ||
334 | struct root_domain { | ||
335 | atomic_t refcount; | ||
336 | atomic_t rto_count; | ||
337 | struct rcu_head rcu; | ||
338 | cpumask_var_t span; | ||
339 | cpumask_var_t online; | ||
340 | |||
341 | /* | ||
342 | * The "RT overload" flag: it gets set if a CPU has more than | ||
343 | * one runnable RT task. | ||
344 | */ | ||
345 | cpumask_var_t rto_mask; | ||
346 | struct cpupri cpupri; | ||
347 | }; | ||
348 | |||
349 | extern struct root_domain def_root_domain; | ||
350 | |||
351 | #endif /* CONFIG_SMP */ | ||
352 | |||
353 | /* | ||
354 | * This is the main, per-CPU runqueue data structure. | ||
355 | * | ||
356 | * Locking rule: in places that want to lock multiple runqueues | ||
357 | * (such as the load balancing or the thread migration code), the lock | ||
358 | * acquire operations must be ordered by ascending &runqueue. | ||
359 | */ | ||
360 | struct rq { | ||
361 | /* runqueue lock: */ | ||
362 | raw_spinlock_t lock; | ||
363 | |||
364 | /* | ||
365 | * nr_running and cpu_load should be in the same cacheline because | ||
366 | * remote CPUs use both these fields when doing load calculation. | ||
367 | */ | ||
368 | unsigned long nr_running; | ||
369 | #define CPU_LOAD_IDX_MAX 5 | ||
370 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
371 | unsigned long last_load_update_tick; | ||
372 | #ifdef CONFIG_NO_HZ | ||
373 | u64 nohz_stamp; | ||
374 | unsigned long nohz_flags; | ||
375 | #endif | ||
376 | int skip_clock_update; | ||
377 | |||
378 | /* capture load from *all* tasks on this cpu: */ | ||
379 | struct load_weight load; | ||
380 | unsigned long nr_load_updates; | ||
381 | u64 nr_switches; | ||
382 | |||
383 | struct cfs_rq cfs; | ||
384 | struct rt_rq rt; | ||
385 | |||
386 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
387 | /* list of leaf cfs_rq on this cpu: */ | ||
388 | struct list_head leaf_cfs_rq_list; | ||
389 | #endif | ||
390 | #ifdef CONFIG_RT_GROUP_SCHED | ||
391 | struct list_head leaf_rt_rq_list; | ||
392 | #endif | ||
393 | |||
394 | /* | ||
395 | * This is part of a global counter where only the total sum | ||
396 | * over all CPUs matters. A task can increase this counter on | ||
397 | * one CPU and if it got migrated afterwards it may decrease | ||
398 | * it on another CPU. Always updated under the runqueue lock: | ||
399 | */ | ||
400 | unsigned long nr_uninterruptible; | ||
401 | |||
402 | struct task_struct *curr, *idle, *stop; | ||
403 | unsigned long next_balance; | ||
404 | struct mm_struct *prev_mm; | ||
405 | |||
406 | u64 clock; | ||
407 | u64 clock_task; | ||
408 | |||
409 | atomic_t nr_iowait; | ||
410 | |||
411 | #ifdef CONFIG_SMP | ||
412 | struct root_domain *rd; | ||
413 | struct sched_domain *sd; | ||
414 | |||
415 | unsigned long cpu_power; | ||
416 | |||
417 | unsigned char idle_balance; | ||
418 | /* For active balancing */ | ||
419 | int post_schedule; | ||
420 | int active_balance; | ||
421 | int push_cpu; | ||
422 | struct cpu_stop_work active_balance_work; | ||
423 | /* cpu of this runqueue: */ | ||
424 | int cpu; | ||
425 | int online; | ||
426 | |||
427 | u64 rt_avg; | ||
428 | u64 age_stamp; | ||
429 | u64 idle_stamp; | ||
430 | u64 avg_idle; | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
434 | u64 prev_irq_time; | ||
435 | #endif | ||
436 | #ifdef CONFIG_PARAVIRT | ||
437 | u64 prev_steal_time; | ||
438 | #endif | ||
439 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
440 | u64 prev_steal_time_rq; | ||
441 | #endif | ||
442 | |||
443 | /* calc_load related fields */ | ||
444 | unsigned long calc_load_update; | ||
445 | long calc_load_active; | ||
446 | |||
447 | #ifdef CONFIG_SCHED_HRTICK | ||
448 | #ifdef CONFIG_SMP | ||
449 | int hrtick_csd_pending; | ||
450 | struct call_single_data hrtick_csd; | ||
451 | #endif | ||
452 | struct hrtimer hrtick_timer; | ||
453 | #endif | ||
454 | |||
455 | #ifdef CONFIG_SCHEDSTATS | ||
456 | /* latency stats */ | ||
457 | struct sched_info rq_sched_info; | ||
458 | unsigned long long rq_cpu_time; | ||
459 | /* could the above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime? */ | ||
460 | |||
461 | /* sys_sched_yield() stats */ | ||
462 | unsigned int yld_count; | ||
463 | |||
464 | /* schedule() stats */ | ||
465 | unsigned int sched_switch; | ||
466 | unsigned int sched_count; | ||
467 | unsigned int sched_goidle; | ||
468 | |||
469 | /* try_to_wake_up() stats */ | ||
470 | unsigned int ttwu_count; | ||
471 | unsigned int ttwu_local; | ||
472 | #endif | ||
473 | |||
474 | #ifdef CONFIG_SMP | ||
475 | struct llist_head wake_list; | ||
476 | #endif | ||
477 | }; | ||
478 | |||
479 | static inline int cpu_of(struct rq *rq) | ||
480 | { | ||
481 | #ifdef CONFIG_SMP | ||
482 | return rq->cpu; | ||
483 | #else | ||
484 | return 0; | ||
485 | #endif | ||
486 | } | ||
487 | |||
488 | DECLARE_PER_CPU(struct rq, runqueues); | ||
489 | |||
490 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
491 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
492 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
493 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
494 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
495 | |||
496 | #ifdef CONFIG_SMP | ||
497 | |||
498 | #define rcu_dereference_check_sched_domain(p) \ | ||
499 | rcu_dereference_check((p), \ | ||
500 | lockdep_is_held(&sched_domains_mutex)) | ||
501 | |||
502 | /* | ||
503 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
504 | * See detach_destroy_domains: synchronize_sched for details. | ||
505 | * | ||
506 | * The domain tree of any CPU may only be accessed from within | ||
507 | * preempt-disabled sections. | ||
508 | */ | ||
509 | #define for_each_domain(cpu, __sd) \ | ||
510 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ | ||
511 | __sd; __sd = __sd->parent) | ||
512 | |||
513 | #define for_each_lower_domain(sd) for (; sd; sd = sd->child) | ||
514 | |||
515 | /** | ||
516 | * highest_flag_domain - Return highest sched_domain containing flag. | ||
517 | * @cpu: The cpu whose highest level of sched domain is to | ||
518 | * be returned. | ||
519 | * @flag: The flag to check for the highest sched_domain | ||
520 | * for the given cpu. | ||
521 | * | ||
522 | * Returns the highest sched_domain of a cpu which contains the given flag. | ||
523 | */ | ||
524 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | ||
525 | { | ||
526 | struct sched_domain *sd, *hsd = NULL; | ||
527 | |||
528 | for_each_domain(cpu, sd) { | ||
529 | if (!(sd->flags & flag)) | ||
530 | break; | ||
531 | hsd = sd; | ||
532 | } | ||
533 | |||
534 | return hsd; | ||
535 | } | ||
536 | |||
537 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | ||
538 | DECLARE_PER_CPU(int, sd_llc_id); | ||
539 | |||
540 | #endif /* CONFIG_SMP */ | ||
541 | |||
542 | #include "stats.h" | ||
543 | #include "auto_group.h" | ||
544 | |||
545 | #ifdef CONFIG_CGROUP_SCHED | ||
546 | |||
547 | /* | ||
548 | * Return the group to which this task belongs. | ||
549 | * | ||
550 | * We use task_subsys_state_check() and extend the RCU verification with | ||
551 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
552 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
553 | * we pin the task to the current cgroup. | ||
554 | */ | ||
555 | static inline struct task_group *task_group(struct task_struct *p) | ||
556 | { | ||
557 | struct task_group *tg; | ||
558 | struct cgroup_subsys_state *css; | ||
559 | |||
560 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
561 | lockdep_is_held(&p->pi_lock) || | ||
562 | lockdep_is_held(&task_rq(p)->lock)); | ||
563 | tg = container_of(css, struct task_group, css); | ||
564 | |||
565 | return autogroup_task_group(p, tg); | ||
566 | } | ||
567 | |||
568 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
569 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
570 | { | ||
571 | #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) | ||
572 | struct task_group *tg = task_group(p); | ||
573 | #endif | ||
574 | |||
575 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
576 | p->se.cfs_rq = tg->cfs_rq[cpu]; | ||
577 | p->se.parent = tg->se[cpu]; | ||
578 | #endif | ||
579 | |||
580 | #ifdef CONFIG_RT_GROUP_SCHED | ||
581 | p->rt.rt_rq = tg->rt_rq[cpu]; | ||
582 | p->rt.parent = tg->rt_se[cpu]; | ||
583 | #endif | ||
584 | } | ||
585 | |||
586 | #else /* CONFIG_CGROUP_SCHED */ | ||
587 | |||
588 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
589 | static inline struct task_group *task_group(struct task_struct *p) | ||
590 | { | ||
591 | return NULL; | ||
592 | } | ||
593 | |||
594 | #endif /* CONFIG_CGROUP_SCHED */ | ||
595 | |||
596 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
597 | { | ||
598 | set_task_rq(p, cpu); | ||
599 | #ifdef CONFIG_SMP | ||
600 | /* | ||
601 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
602 | * successfully executed on another CPU. We must ensure that updates of | ||
603 | * per-task data have been completed by this moment. | ||
604 | */ | ||
605 | smp_wmb(); | ||
606 | task_thread_info(p)->cpu = cpu; | ||
607 | #endif | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
612 | */ | ||
613 | #ifdef CONFIG_SCHED_DEBUG | ||
614 | # include <linux/jump_label.h> | ||
615 | # define const_debug __read_mostly | ||
616 | #else | ||
617 | # define const_debug const | ||
618 | #endif | ||
619 | |||
620 | extern const_debug unsigned int sysctl_sched_features; | ||
621 | |||
622 | #define SCHED_FEAT(name, enabled) \ | ||
623 | __SCHED_FEAT_##name , | ||
624 | |||
625 | enum { | ||
626 | #include "features.h" | ||
627 | __SCHED_FEAT_NR, | ||
628 | }; | ||
629 | |||
630 | #undef SCHED_FEAT | ||
631 | |||
632 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | ||
633 | static __always_inline bool static_branch__true(struct jump_label_key *key) | ||
634 | { | ||
635 | return likely(static_branch(key)); /* Not out of line branch. */ | ||
636 | } | ||
637 | |||
638 | static __always_inline bool static_branch__false(struct jump_label_key *key) | ||
639 | { | ||
640 | return unlikely(static_branch(key)); /* Out of line branch. */ | ||
641 | } | ||
642 | |||
643 | #define SCHED_FEAT(name, enabled) \ | ||
644 | static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | ||
645 | { \ | ||
646 | return static_branch__##enabled(key); \ | ||
647 | } | ||
648 | |||
649 | #include "features.h" | ||
650 | |||
651 | #undef SCHED_FEAT | ||
652 | |||
653 | extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; | ||
654 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | ||
655 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | ||
656 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
657 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | ||
658 | |||
659 | static inline u64 global_rt_period(void) | ||
660 | { | ||
661 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
662 | } | ||
663 | |||
664 | static inline u64 global_rt_runtime(void) | ||
665 | { | ||
666 | if (sysctl_sched_rt_runtime < 0) | ||
667 | return RUNTIME_INF; | ||
668 | |||
669 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
670 | } | ||
671 | |||
672 | |||
673 | |||
674 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
675 | { | ||
676 | return rq->curr == p; | ||
677 | } | ||
678 | |||
679 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
680 | { | ||
681 | #ifdef CONFIG_SMP | ||
682 | return p->on_cpu; | ||
683 | #else | ||
684 | return task_current(rq, p); | ||
685 | #endif | ||
686 | } | ||
687 | |||
688 | |||
689 | #ifndef prepare_arch_switch | ||
690 | # define prepare_arch_switch(next) do { } while (0) | ||
691 | #endif | ||
692 | #ifndef finish_arch_switch | ||
693 | # define finish_arch_switch(prev) do { } while (0) | ||
694 | #endif | ||
695 | |||
696 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
697 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
698 | { | ||
699 | #ifdef CONFIG_SMP | ||
700 | /* | ||
701 | * We can optimise this out completely for !SMP, because the | ||
702 | * SMP rebalancing from interrupt is the only thing that cares | ||
703 | * here. | ||
704 | */ | ||
705 | next->on_cpu = 1; | ||
706 | #endif | ||
707 | } | ||
708 | |||
709 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
710 | { | ||
711 | #ifdef CONFIG_SMP | ||
712 | /* | ||
713 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
714 | * We must ensure this doesn't happen until the switch is completely | ||
715 | * finished. | ||
716 | */ | ||
717 | smp_wmb(); | ||
718 | prev->on_cpu = 0; | ||
719 | #endif | ||
720 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
721 | /* this is a valid case when another task releases the spinlock */ | ||
722 | rq->lock.owner = current; | ||
723 | #endif | ||
724 | /* | ||
725 | * If we are tracking spinlock dependencies then we have to | ||
726 | * fix up the runqueue lock - which gets 'carried over' from | ||
727 | * prev into current: | ||
728 | */ | ||
729 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
730 | |||
731 | raw_spin_unlock_irq(&rq->lock); | ||
732 | } | ||
733 | |||
734 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
735 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
736 | { | ||
737 | #ifdef CONFIG_SMP | ||
738 | /* | ||
739 | * We can optimise this out completely for !SMP, because the | ||
740 | * SMP rebalancing from interrupt is the only thing that cares | ||
741 | * here. | ||
742 | */ | ||
743 | next->on_cpu = 1; | ||
744 | #endif | ||
745 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
746 | raw_spin_unlock_irq(&rq->lock); | ||
747 | #else | ||
748 | raw_spin_unlock(&rq->lock); | ||
749 | #endif | ||
750 | } | ||
751 | |||
752 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
753 | { | ||
754 | #ifdef CONFIG_SMP | ||
755 | /* | ||
756 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
757 | * We must ensure this doesn't happen until the switch is completely | ||
758 | * finished. | ||
759 | */ | ||
760 | smp_wmb(); | ||
761 | prev->on_cpu = 0; | ||
762 | #endif | ||
763 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
764 | local_irq_enable(); | ||
765 | #endif | ||
766 | } | ||
767 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
768 | |||
769 | |||
770 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
771 | { | ||
772 | lw->weight += inc; | ||
773 | lw->inv_weight = 0; | ||
774 | } | ||
775 | |||
776 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
777 | { | ||
778 | lw->weight -= dec; | ||
779 | lw->inv_weight = 0; | ||
780 | } | ||
781 | |||
782 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
783 | { | ||
784 | lw->weight = w; | ||
785 | lw->inv_weight = 0; | ||
786 | } | ||
787 | |||
788 | /* | ||
789 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
790 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
791 | * each task makes to its run queue's load is weighted according to its | ||
792 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
793 | * scaled version of the new time slice allocation that they receive on time | ||
794 | * slice expiry etc. | ||
795 | */ | ||
796 | |||
797 | #define WEIGHT_IDLEPRIO 3 | ||
798 | #define WMULT_IDLEPRIO 1431655765 | ||
799 | |||
800 | /* | ||
801 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
802 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
803 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
804 | * that remained on nice 0. | ||
805 | * | ||
806 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
807 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
808 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
809 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
810 | * the relative distance between them is ~25%.) | ||
811 | */ | ||
812 | static const int prio_to_weight[40] = { | ||
813 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
814 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
815 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
816 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
817 | /* 0 */ 1024, 820, 655, 526, 423, | ||
818 | /* 5 */ 335, 272, 215, 172, 137, | ||
819 | /* 10 */ 110, 87, 70, 56, 45, | ||
820 | /* 15 */ 36, 29, 23, 18, 15, | ||
821 | }; | ||
822 | |||
823 | /* | ||
824 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
825 | * | ||
826 | * In cases where the weight does not change often, we can use the | ||
827 | * precalculated inverse to speed up arithmetic by turning divisions | ||
828 | * into multiplications: | ||
829 | */ | ||
830 | static const u32 prio_to_wmult[40] = { | ||
831 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
832 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
833 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
834 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
835 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
836 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
837 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
838 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
839 | }; | ||
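A rough sketch of the multiply-and-shift this table enables (only the arithmetic idea, not the scheduler's actual delta-scaling helper; overflow handling is deliberately ignored):

    /* Approximates delta / prio_to_weight[idx] via the 2^32/x entries above. */
    static inline u64 div_by_weight(u64 delta, int idx)
    {
            return (delta * prio_to_wmult[idx]) >> 32;
    }

For nice 0 (idx == 20) the entry is 4194304 == 2^32 / 1024, so the helper degenerates to delta >> 10, i.e. an exact division by the nice-0 weight of 1024.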
840 | |||
841 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
842 | enum cpuacct_stat_index { | ||
843 | CPUACCT_STAT_USER, /* ... user mode */ | ||
844 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
845 | |||
846 | CPUACCT_STAT_NSTATS, | ||
847 | }; | ||
848 | |||
849 | |||
850 | #define sched_class_highest (&stop_sched_class) | ||
851 | #define for_each_class(class) \ | ||
852 | for (class = sched_class_highest; class; class = class->next) | ||
853 | |||
854 | extern const struct sched_class stop_sched_class; | ||
855 | extern const struct sched_class rt_sched_class; | ||
856 | extern const struct sched_class fair_sched_class; | ||
857 | extern const struct sched_class idle_sched_class; | ||
858 | |||
859 | |||
860 | #ifdef CONFIG_SMP | ||
861 | |||
862 | extern void trigger_load_balance(struct rq *rq, int cpu); | ||
863 | extern void idle_balance(int this_cpu, struct rq *this_rq); | ||
864 | |||
865 | #else /* CONFIG_SMP */ | ||
866 | |||
867 | static inline void idle_balance(int cpu, struct rq *rq) | ||
868 | { | ||
869 | } | ||
870 | |||
871 | #endif | ||
872 | |||
873 | extern void sysrq_sched_debug_show(void); | ||
874 | extern void sched_init_granularity(void); | ||
875 | extern void update_max_interval(void); | ||
876 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
877 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
878 | extern void init_sched_rt_class(void); | ||
879 | extern void init_sched_fair_class(void); | ||
880 | |||
881 | extern void resched_task(struct task_struct *p); | ||
882 | extern void resched_cpu(int cpu); | ||
883 | |||
884 | extern struct rt_bandwidth def_rt_bandwidth; | ||
885 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | ||
886 | |||
887 | extern void update_cpu_load(struct rq *this_rq); | ||
888 | |||
889 | #ifdef CONFIG_CGROUP_CPUACCT | ||
890 | #include <linux/cgroup.h> | ||
891 | /* track cpu usage of a group of tasks and its child groups */ | ||
892 | struct cpuacct { | ||
893 | struct cgroup_subsys_state css; | ||
894 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
895 | u64 __percpu *cpuusage; | ||
896 | struct kernel_cpustat __percpu *cpustat; | ||
897 | }; | ||
898 | |||
899 | /* return cpu accounting group corresponding to this container */ | ||
900 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
901 | { | ||
902 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
903 | struct cpuacct, css); | ||
904 | } | ||
905 | |||
906 | /* return cpu accounting group to which this task belongs */ | ||
907 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
908 | { | ||
909 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
910 | struct cpuacct, css); | ||
911 | } | ||
912 | |||
913 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
914 | { | ||
915 | if (!ca || !ca->css.cgroup->parent) | ||
916 | return NULL; | ||
917 | return cgroup_ca(ca->css.cgroup->parent); | ||
918 | } | ||
919 | |||
920 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
921 | #else | ||
922 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
923 | #endif | ||
924 | |||
925 | static inline void inc_nr_running(struct rq *rq) | ||
926 | { | ||
927 | rq->nr_running++; | ||
928 | } | ||
929 | |||
930 | static inline void dec_nr_running(struct rq *rq) | ||
931 | { | ||
932 | rq->nr_running--; | ||
933 | } | ||
934 | |||
935 | extern void update_rq_clock(struct rq *rq); | ||
936 | |||
937 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | ||
938 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | ||
939 | |||
940 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
941 | |||
942 | extern const_debug unsigned int sysctl_sched_time_avg; | ||
943 | extern const_debug unsigned int sysctl_sched_nr_migrate; | ||
944 | extern const_debug unsigned int sysctl_sched_migration_cost; | ||
945 | |||
946 | static inline u64 sched_avg_period(void) | ||
947 | { | ||
948 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
949 | } | ||
950 | |||
951 | void calc_load_account_idle(struct rq *this_rq); | ||
952 | |||
953 | #ifdef CONFIG_SCHED_HRTICK | ||
954 | |||
955 | /* | ||
956 | * Use hrtick when: | ||
957 | * - enabled by features | ||
958 | * - hrtimer is actually high res | ||
959 | */ | ||
960 | static inline int hrtick_enabled(struct rq *rq) | ||
961 | { | ||
962 | if (!sched_feat(HRTICK)) | ||
963 | return 0; | ||
964 | if (!cpu_active(cpu_of(rq))) | ||
965 | return 0; | ||
966 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
967 | } | ||
968 | |||
969 | void hrtick_start(struct rq *rq, u64 delay); | ||
970 | |||
971 | #else | ||
972 | |||
973 | static inline int hrtick_enabled(struct rq *rq) | ||
974 | { | ||
975 | return 0; | ||
976 | } | ||
977 | |||
978 | #endif /* CONFIG_SCHED_HRTICK */ | ||
979 | |||
980 | #ifdef CONFIG_SMP | ||
981 | extern void sched_avg_update(struct rq *rq); | ||
982 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
983 | { | ||
984 | rq->rt_avg += rt_delta; | ||
985 | sched_avg_update(rq); | ||
986 | } | ||
987 | #else | ||
988 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | ||
989 | static inline void sched_avg_update(struct rq *rq) { } | ||
990 | #endif | ||
991 | |||
992 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); | ||
993 | |||
994 | #ifdef CONFIG_SMP | ||
995 | #ifdef CONFIG_PREEMPT | ||
996 | |||
997 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
998 | |||
999 | /* | ||
1000 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
1001 | * way at the expense of forcing extra atomic operations in all | ||
1002 | * invocations. This assures that the double_lock is acquired using the | ||
1003 | * same underlying policy as the spinlock_t on this architecture, which | ||
1004 | * reduces latency compared to the unfair variant below. However, it | ||
1005 | * also adds more overhead and therefore may reduce throughput. | ||
1006 | */ | ||
1007 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1008 | __releases(this_rq->lock) | ||
1009 | __acquires(busiest->lock) | ||
1010 | __acquires(this_rq->lock) | ||
1011 | { | ||
1012 | raw_spin_unlock(&this_rq->lock); | ||
1013 | double_rq_lock(this_rq, busiest); | ||
1014 | |||
1015 | return 1; | ||
1016 | } | ||
1017 | |||
1018 | #else | ||
1019 | /* | ||
1020 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1021 | * latency by eliminating extra atomic operations when the locks are | ||
1022 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1023 | * grant the double lock to lower cpus over higher ids under contention, | ||
1024 | * regardless of entry order into the function. | ||
1025 | */ | ||
1026 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1027 | __releases(this_rq->lock) | ||
1028 | __acquires(busiest->lock) | ||
1029 | __acquires(this_rq->lock) | ||
1030 | { | ||
1031 | int ret = 0; | ||
1032 | |||
1033 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1034 | if (busiest < this_rq) { | ||
1035 | raw_spin_unlock(&this_rq->lock); | ||
1036 | raw_spin_lock(&busiest->lock); | ||
1037 | raw_spin_lock_nested(&this_rq->lock, | ||
1038 | SINGLE_DEPTH_NESTING); | ||
1039 | ret = 1; | ||
1040 | } else | ||
1041 | raw_spin_lock_nested(&busiest->lock, | ||
1042 | SINGLE_DEPTH_NESTING); | ||
1043 | } | ||
1044 | return ret; | ||
1045 | } | ||
1046 | |||
1047 | #endif /* CONFIG_PREEMPT */ | ||
1048 | |||
1049 | /* | ||
1050 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1051 | */ | ||
1052 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1053 | { | ||
1054 | if (unlikely(!irqs_disabled())) { | ||
1055 | /* printk() doesn't work well under rq->lock */ | ||
1056 | raw_spin_unlock(&this_rq->lock); | ||
1057 | BUG_ON(1); | ||
1058 | } | ||
1059 | |||
1060 | return _double_lock_balance(this_rq, busiest); | ||
1061 | } | ||
1062 | |||
1063 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1064 | __releases(busiest->lock) | ||
1065 | { | ||
1066 | raw_spin_unlock(&busiest->lock); | ||
1067 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1068 | } | ||
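For readers new to these helpers, the expected calling pattern is roughly as follows (a sketch only; this_rq and busiest stand for whatever runqueues the balancer is working with, and this_rq->lock is already held with IRQs off):

    if (double_lock_balance(this_rq, busiest)) {
            /*
             * A non-zero return means this_rq->lock was dropped and
             * retaken, so any state read under it before the call must
             * be revalidated here.
             */
    }

    /* ... work while holding both rq locks ... */

    double_unlock_balance(this_rq, busiest);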
1069 | |||
1070 | /* | ||
1071 | * double_rq_lock - safely lock two runqueues | ||
1072 | * | ||
1073 | * Note this does not disable interrupts like task_rq_lock; | ||
1074 | * you need to do so manually before calling. | ||
1075 | */ | ||
1076 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1077 | __acquires(rq1->lock) | ||
1078 | __acquires(rq2->lock) | ||
1079 | { | ||
1080 | BUG_ON(!irqs_disabled()); | ||
1081 | if (rq1 == rq2) { | ||
1082 | raw_spin_lock(&rq1->lock); | ||
1083 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1084 | } else { | ||
1085 | if (rq1 < rq2) { | ||
1086 | raw_spin_lock(&rq1->lock); | ||
1087 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1088 | } else { | ||
1089 | raw_spin_lock(&rq2->lock); | ||
1090 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1091 | } | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | /* | ||
1096 | * double_rq_unlock - safely unlock two runqueues | ||
1097 | * | ||
1098 | * Note this does not restore interrupts like task_rq_unlock; | ||
1099 | * you need to do so manually after calling. | ||
1100 | */ | ||
1101 | static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1102 | __releases(rq1->lock) | ||
1103 | __releases(rq2->lock) | ||
1104 | { | ||
1105 | raw_spin_unlock(&rq1->lock); | ||
1106 | if (rq1 != rq2) | ||
1107 | raw_spin_unlock(&rq2->lock); | ||
1108 | else | ||
1109 | __release(rq2->lock); | ||
1110 | } | ||
1111 | |||
1112 | #else /* CONFIG_SMP */ | ||
1113 | |||
1114 | /* | ||
1115 | * double_rq_lock - safely lock two runqueues | ||
1116 | * | ||
1117 | * Note this does not disable interrupts like task_rq_lock; | ||
1118 | * you need to do so manually before calling. | ||
1119 | */ | ||
1120 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1121 | __acquires(rq1->lock) | ||
1122 | __acquires(rq2->lock) | ||
1123 | { | ||
1124 | BUG_ON(!irqs_disabled()); | ||
1125 | BUG_ON(rq1 != rq2); | ||
1126 | raw_spin_lock(&rq1->lock); | ||
1127 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1128 | } | ||
1129 | |||
1130 | /* | ||
1131 | * double_rq_unlock - safely unlock two runqueues | ||
1132 | * | ||
1133 | * Note this does not restore interrupts like task_rq_unlock; | ||
1134 | * you need to do so manually after calling. | ||
1135 | */ | ||
1136 | static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1137 | __releases(rq1->lock) | ||
1138 | __releases(rq2->lock) | ||
1139 | { | ||
1140 | BUG_ON(rq1 != rq2); | ||
1141 | raw_spin_unlock(&rq1->lock); | ||
1142 | __release(rq2->lock); | ||
1143 | } | ||
1144 | |||
1145 | #endif | ||
1146 | |||
1147 | extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | ||
1148 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | ||
1149 | extern void print_cfs_stats(struct seq_file *m, int cpu); | ||
1150 | extern void print_rt_stats(struct seq_file *m, int cpu); | ||
1151 | |||
1152 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | ||
1153 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | ||
1154 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
1155 | |||
1156 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | ||
1157 | |||
1158 | #ifdef CONFIG_NO_HZ | ||
1159 | enum rq_nohz_flag_bits { | ||
1160 | NOHZ_TICK_STOPPED, | ||
1161 | NOHZ_BALANCE_KICK, | ||
1162 | NOHZ_IDLE, | ||
1163 | }; | ||
1164 | |||
1165 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | ||
1166 | #endif | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000000..2a581ba8e190 --- /dev/null +++ b/kernel/sched/stats.c | |||
@@ -0,0 +1,111 @@ | |||
1 | |||
2 | #include <linux/slab.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/proc_fs.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | /* | ||
10 | * bump this up when changing the output format or the meaning of an existing | ||
11 | * format, so that tools can adapt (or abort) | ||
12 | */ | ||
13 | #define SCHEDSTAT_VERSION 15 | ||
14 | |||
15 | static int show_schedstat(struct seq_file *seq, void *v) | ||
16 | { | ||
17 | int cpu; | ||
18 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
19 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
20 | |||
21 | if (mask_str == NULL) | ||
22 | return -ENOMEM; | ||
23 | |||
24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
25 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
26 | for_each_online_cpu(cpu) { | ||
27 | struct rq *rq = cpu_rq(cpu); | ||
28 | #ifdef CONFIG_SMP | ||
29 | struct sched_domain *sd; | ||
30 | int dcount = 0; | ||
31 | #endif | ||
32 | |||
33 | /* runqueue-specific stats */ | ||
34 | seq_printf(seq, | ||
35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
36 | cpu, rq->yld_count, | ||
37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
38 | rq->ttwu_count, rq->ttwu_local, | ||
39 | rq->rq_cpu_time, | ||
40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
41 | |||
42 | seq_printf(seq, "\n"); | ||
43 | |||
44 | #ifdef CONFIG_SMP | ||
45 | /* domain-specific stats */ | ||
46 | rcu_read_lock(); | ||
47 | for_each_domain(cpu, sd) { | ||
48 | enum cpu_idle_type itype; | ||
49 | |||
50 | cpumask_scnprintf(mask_str, mask_len, | ||
51 | sched_domain_span(sd)); | ||
52 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
53 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
54 | itype++) { | ||
55 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
56 | sd->lb_count[itype], | ||
57 | sd->lb_balanced[itype], | ||
58 | sd->lb_failed[itype], | ||
59 | sd->lb_imbalance[itype], | ||
60 | sd->lb_gained[itype], | ||
61 | sd->lb_hot_gained[itype], | ||
62 | sd->lb_nobusyq[itype], | ||
63 | sd->lb_nobusyg[itype]); | ||
64 | } | ||
65 | seq_printf(seq, | ||
66 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
67 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
68 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
69 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
70 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
71 | sd->ttwu_move_balance); | ||
72 | } | ||
73 | rcu_read_unlock(); | ||
74 | #endif | ||
75 | } | ||
76 | kfree(mask_str); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static int schedstat_open(struct inode *inode, struct file *file) | ||
81 | { | ||
82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
84 | struct seq_file *m; | ||
85 | int res; | ||
86 | |||
87 | if (!buf) | ||
88 | return -ENOMEM; | ||
89 | res = single_open(file, show_schedstat, NULL); | ||
90 | if (!res) { | ||
91 | m = file->private_data; | ||
92 | m->buf = buf; | ||
93 | m->size = size; | ||
94 | } else | ||
95 | kfree(buf); | ||
96 | return res; | ||
97 | } | ||
98 | |||
99 | static const struct file_operations proc_schedstat_operations = { | ||
100 | .open = schedstat_open, | ||
101 | .read = seq_read, | ||
102 | .llseek = seq_lseek, | ||
103 | .release = single_release, | ||
104 | }; | ||
105 | |||
106 | static int __init proc_schedstat_init(void) | ||
107 | { | ||
108 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
109 | return 0; | ||
110 | } | ||
111 | module_init(proc_schedstat_init); | ||
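As a usage note, the interface created above produces output shaped roughly like this (numbers invented for illustration; the per-cpu fields follow the seq_printf() order in show_schedstat(): yld_count, sched_switch, sched_count, sched_goidle, ttwu_count, ttwu_local, rq_cpu_time, then run_delay and pcount from rq_sched_info):

    $ cat /proc/schedstat
    version 15
    timestamp 4295222400
    cpu0 0 0 183561 42199 96402 31884 21093867552 9361551201 128409

On SMP builds each cpu line is followed by one domainN line per sched_domain, carrying the lb_* counters for every idle type plus the alb/sbe/sbf/ttwu fields printed above.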
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 87f9e36ea56e..2ef90a51ec5e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h | |||
@@ -1,108 +1,5 @@ | |||
1 | 1 | ||
2 | #ifdef CONFIG_SCHEDSTATS | 2 | #ifdef CONFIG_SCHEDSTATS |
3 | /* | ||
4 | * bump this up when changing the output format or the meaning of an existing | ||
5 | * format, so that tools can adapt (or abort) | ||
6 | */ | ||
7 | #define SCHEDSTAT_VERSION 15 | ||
8 | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | ||
10 | { | ||
11 | int cpu; | ||
12 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
14 | |||
15 | if (mask_str == NULL) | ||
16 | return -ENOMEM; | ||
17 | |||
18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
19 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
20 | for_each_online_cpu(cpu) { | ||
21 | struct rq *rq = cpu_rq(cpu); | ||
22 | #ifdef CONFIG_SMP | ||
23 | struct sched_domain *sd; | ||
24 | int dcount = 0; | ||
25 | #endif | ||
26 | |||
27 | /* runqueue-specific stats */ | ||
28 | seq_printf(seq, | ||
29 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
30 | cpu, rq->yld_count, | ||
31 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
32 | rq->ttwu_count, rq->ttwu_local, | ||
33 | rq->rq_cpu_time, | ||
34 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
35 | |||
36 | seq_printf(seq, "\n"); | ||
37 | |||
38 | #ifdef CONFIG_SMP | ||
39 | /* domain-specific stats */ | ||
40 | rcu_read_lock(); | ||
41 | for_each_domain(cpu, sd) { | ||
42 | enum cpu_idle_type itype; | ||
43 | |||
44 | cpumask_scnprintf(mask_str, mask_len, | ||
45 | sched_domain_span(sd)); | ||
46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
48 | itype++) { | ||
49 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
50 | sd->lb_count[itype], | ||
51 | sd->lb_balanced[itype], | ||
52 | sd->lb_failed[itype], | ||
53 | sd->lb_imbalance[itype], | ||
54 | sd->lb_gained[itype], | ||
55 | sd->lb_hot_gained[itype], | ||
56 | sd->lb_nobusyq[itype], | ||
57 | sd->lb_nobusyg[itype]); | ||
58 | } | ||
59 | seq_printf(seq, | ||
60 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
61 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
62 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
63 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
65 | sd->ttwu_move_balance); | ||
66 | } | ||
67 | rcu_read_unlock(); | ||
68 | #endif | ||
69 | } | ||
70 | kfree(mask_str); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static int schedstat_open(struct inode *inode, struct file *file) | ||
75 | { | ||
76 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
77 | char *buf = kmalloc(size, GFP_KERNEL); | ||
78 | struct seq_file *m; | ||
79 | int res; | ||
80 | |||
81 | if (!buf) | ||
82 | return -ENOMEM; | ||
83 | res = single_open(file, show_schedstat, NULL); | ||
84 | if (!res) { | ||
85 | m = file->private_data; | ||
86 | m->buf = buf; | ||
87 | m->size = size; | ||
88 | } else | ||
89 | kfree(buf); | ||
90 | return res; | ||
91 | } | ||
92 | |||
93 | static const struct file_operations proc_schedstat_operations = { | ||
94 | .open = schedstat_open, | ||
95 | .read = seq_read, | ||
96 | .llseek = seq_lseek, | ||
97 | .release = single_release, | ||
98 | }; | ||
99 | |||
100 | static int __init proc_schedstat_init(void) | ||
101 | { | ||
102 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
103 | return 0; | ||
104 | } | ||
105 | module_init(proc_schedstat_init); | ||
106 | 3 | ||
107 | /* | 4 | /* |
108 | * Expects runqueue lock to be held for atomicity of update | 5 | * Expects runqueue lock to be held for atomicity of update |
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
283 | return; | 180 | return; |
284 | 181 | ||
285 | raw_spin_lock(&cputimer->lock); | 182 | raw_spin_lock(&cputimer->lock); |
286 | cputimer->cputime.utime = | 183 | cputimer->cputime.utime += cputime; |
287 | cputime_add(cputimer->cputime.utime, cputime); | ||
288 | raw_spin_unlock(&cputimer->lock); | 184 | raw_spin_unlock(&cputimer->lock); |
289 | } | 185 | } |
290 | 186 | ||
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
307 | return; | 203 | return; |
308 | 204 | ||
309 | raw_spin_lock(&cputimer->lock); | 205 | raw_spin_lock(&cputimer->lock); |
310 | cputimer->cputime.stime = | 206 | cputimer->cputime.stime += cputime; |
311 | cputime_add(cputimer->cputime.stime, cputime); | ||
312 | raw_spin_unlock(&cputimer->lock); | 207 | raw_spin_unlock(&cputimer->lock); |
313 | } | 208 | } |
314 | 209 | ||
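
Editor's note: the two hunks above (and the kernel/signal.c, kernel/sys.c and kernel/tsacct.c hunks further down) are instances of the same tree-wide conversion: cputime_t is now treated as an ordinary integer type, so cputime_add()/cputime_sub() become + and -, and cputime_zero becomes a plain 0. A minimal sketch of the pattern, with illustrative names only:

#include <linux/sched.h>	/* cputime_t */

/* Accumulate a cputime delta; after the conversion this is plain
 * integer arithmetic (was: *sum = cputime_add(*sum, delta)). */
static inline void accumulate_cputime(cputime_t *sum, cputime_t delta)
{
	*sum += delta;
}
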
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 8b44e7fa7fb3..7b386e86fd23 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * stop-task scheduling class. | 4 | * stop-task scheduling class. |
3 | * | 5 | * |
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
80 | /* | 82 | /* |
81 | * Simple, special scheduling class for the per-CPU stop tasks: | 83 | * Simple, special scheduling class for the per-CPU stop tasks: |
82 | */ | 84 | */ |
83 | static const struct sched_class stop_sched_class = { | 85 | const struct sched_class stop_sched_class = { |
84 | .next = &rt_sched_class, | 86 | .next = &rt_sched_class, |
85 | 87 | ||
86 | .enqueue_task = enqueue_task_stop, | 88 | .enqueue_task = enqueue_task_stop, |
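
Editor's note: dropping the static qualifier from stop_sched_class only makes sense if the symbol is now shared between the files under kernel/sched/, consistent with the #include "sched.h" added at the top of stop_task.c above. Presumably the new kernel/sched/sched.h (listed in the diffstat but not shown in this section) declares it along these lines — a sketch, not the actual header:

/* kernel/sched/sched.h (assumed declaration) */
extern const struct sched_class stop_sched_class;
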
diff --git a/kernel/signal.c b/kernel/signal.c index 206551563cce..56ce3a618b28 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1629 | info.si_uid = __task_cred(tsk)->uid; | 1629 | info.si_uid = __task_cred(tsk)->uid; |
1630 | rcu_read_unlock(); | 1630 | rcu_read_unlock(); |
1631 | 1631 | ||
1632 | info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, | 1632 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
1633 | tsk->signal->utime)); | 1633 | info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); |
1634 | info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, | ||
1635 | tsk->signal->stime)); | ||
1636 | 1634 | ||
1637 | info.si_status = tsk->exit_code & 0x7f; | 1635 | info.si_status = tsk->exit_code & 0x7f; |
1638 | if (tsk->exit_code & 0x80) | 1636 | if (tsk->exit_code & 0x80) |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c71d91efff0..4eb3a0fa351e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -347,12 +347,12 @@ void irq_exit(void) | |||
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
348 | invoke_softirq(); | 348 | invoke_softirq(); |
349 | 349 | ||
350 | rcu_irq_exit(); | ||
351 | #ifdef CONFIG_NO_HZ | 350 | #ifdef CONFIG_NO_HZ |
352 | /* Make sure that timer wheel updates are propagated */ | 351 | /* Make sure that timer wheel updates are propagated */ |
353 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | 352 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) |
354 | tick_nohz_stop_sched_tick(0); | 353 | tick_nohz_irq_exit(); |
355 | #endif | 354 | #endif |
355 | rcu_irq_exit(); | ||
356 | preempt_enable_no_resched(); | 356 | preempt_enable_no_resched(); |
357 | } | 357 | } |
358 | 358 | ||
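
Editor's note: the softirq.c hunk swaps the order of the tick and RCU exit work, presumably because tick_nohz_irq_exit() may arm timers and otherwise do work that must happen before rcu_irq_exit() lets RCU treat an idle CPU as quiescent. A condensed, illustrative sketch of the resulting ordering (not the literal irq_exit() body):

#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/tick.h>
#include <linux/rcupdate.h>

/* Illustrative only: the ordering irq_exit() now relies on. */
static void irq_exit_tick_then_rcu(void)
{
	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
		tick_nohz_irq_exit();	/* reprogram the tick while RCU still watches */
	rcu_irq_exit();			/* only afterwards drop into RCU's idle view */
}
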
diff --git a/kernel/sys.c b/kernel/sys.c index 481611fbd079..ddf8155bf3f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1605 | unsigned long maxrss = 0; | 1605 | unsigned long maxrss = 0; |
1606 | 1606 | ||
1607 | memset((char *) r, 0, sizeof *r); | 1607 | memset((char *) r, 0, sizeof *r); |
1608 | utime = stime = cputime_zero; | 1608 | utime = stime = 0; |
1609 | 1609 | ||
1610 | if (who == RUSAGE_THREAD) { | 1610 | if (who == RUSAGE_THREAD) { |
1611 | task_times(current, &utime, &stime); | 1611 | task_times(current, &utime, &stime); |
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1635 | 1635 | ||
1636 | case RUSAGE_SELF: | 1636 | case RUSAGE_SELF: |
1637 | thread_group_times(p, &tgutime, &tgstime); | 1637 | thread_group_times(p, &tgutime, &tgstime); |
1638 | utime = cputime_add(utime, tgutime); | 1638 | utime += tgutime; |
1639 | stime = cputime_add(stime, tgstime); | 1639 | stime += tgstime; |
1640 | r->ru_nvcsw += p->signal->nvcsw; | 1640 | r->ru_nvcsw += p->signal->nvcsw; |
1641 | r->ru_nivcsw += p->signal->nivcsw; | 1641 | r->ru_nivcsw += p->signal->nivcsw; |
1642 | r->ru_minflt += p->signal->min_flt; | 1642 | r->ru_minflt += p->signal->min_flt; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0ba..7656642e4b8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
275 | } | 275 | } |
276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
277 | 277 | ||
278 | /** | 278 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
279 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
280 | * | ||
281 | * When the next event is more than a tick into the future, stop the idle tick | ||
282 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
283 | * just interrupted by an interrupt which did not cause a reschedule. | ||
284 | */ | ||
285 | void tick_nohz_stop_sched_tick(int inidle) | ||
286 | { | 279 | { |
287 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 280 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
288 | struct tick_sched *ts; | ||
289 | ktime_t last_update, expires, now; | 281 | ktime_t last_update, expires, now; |
290 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 282 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
291 | u64 time_delta; | 283 | u64 time_delta; |
292 | int cpu; | 284 | int cpu; |
293 | 285 | ||
294 | local_irq_save(flags); | ||
295 | |||
296 | cpu = smp_processor_id(); | 286 | cpu = smp_processor_id(); |
297 | ts = &per_cpu(tick_cpu_sched, cpu); | 287 | ts = &per_cpu(tick_cpu_sched, cpu); |
298 | 288 | ||
299 | /* | ||
300 | * Call to tick_nohz_start_idle stops the last_update_time from being | ||
301 | * updated. Thus, it must not be called in the event we are called from | ||
302 | * irq_exit() with the prior state different than idle. | ||
303 | */ | ||
304 | if (!inidle && !ts->inidle) | ||
305 | goto end; | ||
306 | |||
307 | /* | ||
308 | * Set ts->inidle unconditionally. Even if the system did not | ||
309 | * switch to NOHZ mode the cpu frequency governers rely on the | ||
310 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
311 | */ | ||
312 | ts->inidle = 1; | ||
313 | |||
314 | now = tick_nohz_start_idle(cpu, ts); | 289 | now = tick_nohz_start_idle(cpu, ts); |
315 | 290 | ||
316 | /* | 291 | /* |
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
326 | } | 301 | } |
327 | 302 | ||
328 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 303 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
329 | goto end; | 304 | return; |
330 | 305 | ||
331 | if (need_resched()) | 306 | if (need_resched()) |
332 | goto end; | 307 | return; |
333 | 308 | ||
334 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 309 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
335 | static int ratelimit; | 310 | static int ratelimit; |
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
339 | (unsigned int) local_softirq_pending()); | 314 | (unsigned int) local_softirq_pending()); |
340 | ratelimit++; | 315 | ratelimit++; |
341 | } | 316 | } |
342 | goto end; | 317 | return; |
343 | } | 318 | } |
344 | 319 | ||
345 | ts->idle_calls++; | 320 | ts->idle_calls++; |
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
434 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 409 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
435 | ts->tick_stopped = 1; | 410 | ts->tick_stopped = 1; |
436 | ts->idle_jiffies = last_jiffies; | 411 | ts->idle_jiffies = last_jiffies; |
437 | rcu_enter_nohz(); | ||
438 | } | 412 | } |
439 | 413 | ||
440 | ts->idle_sleeps++; | 414 | ts->idle_sleeps++; |
@@ -472,8 +446,64 @@ out: | |||
472 | ts->next_jiffies = next_jiffies; | 446 | ts->next_jiffies = next_jiffies; |
473 | ts->last_jiffies = last_jiffies; | 447 | ts->last_jiffies = last_jiffies; |
474 | ts->sleep_length = ktime_sub(dev->next_event, now); | 448 | ts->sleep_length = ktime_sub(dev->next_event, now); |
475 | end: | 449 | } |
476 | local_irq_restore(flags); | 450 | |
451 | /** | ||
452 | * tick_nohz_idle_enter - stop the idle tick from the idle task | ||
453 | * | ||
454 | * When the next event is more than a tick into the future, stop the idle tick | ||
455 | * Called when we start the idle loop. | ||
456 | * | ||
457 | * The arch is responsible for calling: | ||
458 | * | ||
459 | * - rcu_idle_enter() after its last use of RCU before the CPU is put | ||
460 | * to sleep. | ||
461 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. | ||
462 | */ | ||
463 | void tick_nohz_idle_enter(void) | ||
464 | { | ||
465 | struct tick_sched *ts; | ||
466 | |||
467 | WARN_ON_ONCE(irqs_disabled()); | ||
468 | |||
469 | /* | ||
470 | * Update the idle state in the scheduler domain hierarchy | ||
471 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
472 | * State will be updated to busy during the first busy tick after | ||
473 | * exiting idle. | ||
474 | */ | ||
475 | set_cpu_sd_state_idle(); | ||
476 | |||
477 | local_irq_disable(); | ||
478 | |||
479 | ts = &__get_cpu_var(tick_cpu_sched); | ||
480 | /* | ||
481 | * Set ts->inidle unconditionally. Even if the system did not | ||
482 | * switch to NOHZ mode, the cpu frequency governors rely on the | ||
483 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
484 | */ | ||
485 | ts->inidle = 1; | ||
486 | tick_nohz_stop_sched_tick(ts); | ||
487 | |||
488 | local_irq_enable(); | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * tick_nohz_irq_exit - update next tick event from interrupt exit | ||
493 | * | ||
494 | * When an interrupt fires while we are idle and it doesn't cause | ||
495 | * a reschedule, it may still add, modify or delete a timer, enqueue | ||
496 | * an RCU callback, etc... | ||
497 | * So we need to re-calculate and reprogram the next tick event. | ||
498 | */ | ||
499 | void tick_nohz_irq_exit(void) | ||
500 | { | ||
501 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
502 | |||
503 | if (!ts->inidle) | ||
504 | return; | ||
505 | |||
506 | tick_nohz_stop_sched_tick(ts); | ||
477 | } | 507 | } |
478 | 508 | ||
479 | /** | 509 | /** |
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
515 | } | 545 | } |
516 | 546 | ||
517 | /** | 547 | /** |
518 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | 548 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
519 | * | 549 | * |
520 | * Restart the idle tick when the CPU is woken up from idle | 550 | * Restart the idle tick when the CPU is woken up from idle |
551 | * This also exits the RCU extended quiescent state. The CPU | ||
552 | * can use RCU again after this function is called. | ||
521 | */ | 553 | */ |
522 | void tick_nohz_restart_sched_tick(void) | 554 | void tick_nohz_idle_exit(void) |
523 | { | 555 | { |
524 | int cpu = smp_processor_id(); | 556 | int cpu = smp_processor_id(); |
525 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 557 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void) | |||
529 | ktime_t now; | 561 | ktime_t now; |
530 | 562 | ||
531 | local_irq_disable(); | 563 | local_irq_disable(); |
564 | |||
532 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) | 565 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) |
533 | now = ktime_get(); | 566 | now = ktime_get(); |
534 | 567 | ||
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void) | |||
543 | 576 | ||
544 | ts->inidle = 0; | 577 | ts->inidle = 0; |
545 | 578 | ||
546 | rcu_exit_nohz(); | ||
547 | |||
548 | /* Update jiffies first */ | 579 | /* Update jiffies first */ |
549 | select_nohz_load_balancer(0); | 580 | select_nohz_load_balancer(0); |
550 | tick_do_update_jiffies64(now); | 581 | tick_do_update_jiffies64(now); |
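
Editor's note: the split of tick_nohz_stop_sched_tick() into tick_nohz_idle_enter()/tick_nohz_idle_exit() moves the RCU extended-quiescent-state handling out of the tick code, with the contract spelled out in the new kerneldoc above (the arch calls rcu_idle_enter() after its last RCU use and rcu_idle_exit() before its first use after wake-up). A hedged sketch of how an architecture idle loop is expected to use the pair — the function and its body are illustrative, not any particular arch's code:

#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/rcupdate.h>

/* Illustrative idle loop following the documented calling convention. */
static void cpu_idle_loop(void)
{
	while (1) {
		tick_nohz_idle_enter();	/* may stop the periodic tick */
		rcu_idle_enter();	/* last RCU use before sleeping */

		while (!need_resched())
			cpu_relax();	/* stand-in for the arch halt/wfi */

		rcu_idle_exit();	/* RCU is usable again from here on */
		tick_nohz_idle_exit();	/* restart the tick, fold in idle time */
		schedule();
	}
}
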
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 237841378c03..0c6358186401 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
131 | /* calculate the delta since the last update_wall_time: */ | 131 | /* calculate the delta since the last update_wall_time: */ |
132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
133 | 133 | ||
134 | /* return delta convert to nanoseconds using ntp adjusted mult. */ | 134 | /* return delta converted to nanoseconds. */ |
135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
136 | } | 136 | } |
137 | 137 | ||
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset) | |||
813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | 813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. |
814 | * | 814 | * |
815 | * Note we subtract one in the shift, so that error is really error*2. | 815 | * Note we subtract one in the shift, so that error is really error*2. |
816 | * This "saves" dividing(shifting) intererval twice, but keeps the | 816 | * This "saves" dividing(shifting) interval twice, but keeps the |
817 | * (error > interval) comparision as still measuring if error is | 817 | * (error > interval) comparison as still measuring if error is |
818 | * larger than half an interval. | 818 | * larger than half an interval. |
819 | * | 819 | * |
820 | * Note: It does not "save" on aggrivation when reading the code. | 820 | * Note: It does not "save" on aggravation when reading the code. |
821 | */ | 821 | */ |
822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
823 | if (error > interval) { | 823 | if (error > interval) { |
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset) | |||
833 | * nanosecond, and store the amount rounded up into | 833 | * nanosecond, and store the amount rounded up into |
834 | * the error. This causes the likely below to be unlikely. | 834 | * the error. This causes the likely below to be unlikely. |
835 | * | 835 | * |
836 | * The properfix is to avoid rounding up by using | 836 | * The proper fix is to avoid rounding up by using |
837 | * the high precision timekeeper.xtime_nsec instead of | 837 | * the high precision timekeeper.xtime_nsec instead of |
838 | * xtime.tv_nsec everywhere. Fixing this will take some | 838 | * xtime.tv_nsec everywhere. Fixing this will take some |
839 | * time. | 839 | * time. |
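
Editor's note: the "subtract one in the shift" comment that survives above is easier to see as arithmetic: shifting by (ntp_error_shift - 1) yields twice the scaled error, so comparing that against interval asks whether the real error exceeds half an interval. A tiny sketch with illustrative names and types:

/* (ntp_error >> (shift - 1)) > interval
 *     <=>  (ntp_error >> shift) > interval / 2     (modulo rounding)
 * i.e. the "error" being tested really is error * 2. */
static inline int error_exceeds_half_interval(long long ntp_error, int shift,
					      long long interval)
{
	return (ntp_error >> (shift - 1)) > interval;
}
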
diff --git a/kernel/timer.c b/kernel/timer.c index 9c3c62b0c4bc..a297ffcf888e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) | |||
427 | } | 427 | } |
428 | } | 428 | } |
429 | 429 | ||
430 | /* Stub timer callback for improperly used timers. */ | ||
431 | static void stub_timer(unsigned long data) | ||
432 | { | ||
433 | WARN_ON(1); | ||
434 | } | ||
435 | |||
430 | /* | 436 | /* |
431 | * fixup_activate is called when: | 437 | * fixup_activate is called when: |
432 | * - an active object is activated | 438 | * - an active object is activated |
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) | |||
450 | debug_object_activate(timer, &timer_debug_descr); | 456 | debug_object_activate(timer, &timer_debug_descr); |
451 | return 0; | 457 | return 0; |
452 | } else { | 458 | } else { |
453 | WARN_ON_ONCE(1); | 459 | setup_timer(timer, stub_timer, 0); |
460 | return 1; | ||
454 | } | 461 | } |
455 | return 0; | 462 | return 0; |
456 | 463 | ||
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
480 | } | 487 | } |
481 | } | 488 | } |
482 | 489 | ||
490 | /* | ||
491 | * fixup_assert_init is called when: | ||
492 | * - an untracked/uninit-ed object is found | ||
493 | */ | ||
494 | static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) | ||
495 | { | ||
496 | struct timer_list *timer = addr; | ||
497 | |||
498 | switch (state) { | ||
499 | case ODEBUG_STATE_NOTAVAILABLE: | ||
500 | if (timer->entry.prev == TIMER_ENTRY_STATIC) { | ||
501 | /* | ||
502 | * This is not really a fixup. The timer was | ||
503 | * statically initialized. We just make sure that it | ||
504 | * is tracked in the object tracker. | ||
505 | */ | ||
506 | debug_object_init(timer, &timer_debug_descr); | ||
507 | return 0; | ||
508 | } else { | ||
509 | setup_timer(timer, stub_timer, 0); | ||
510 | return 1; | ||
511 | } | ||
512 | default: | ||
513 | return 0; | ||
514 | } | ||
515 | } | ||
516 | |||
483 | static struct debug_obj_descr timer_debug_descr = { | 517 | static struct debug_obj_descr timer_debug_descr = { |
484 | .name = "timer_list", | 518 | .name = "timer_list", |
485 | .debug_hint = timer_debug_hint, | 519 | .debug_hint = timer_debug_hint, |
486 | .fixup_init = timer_fixup_init, | 520 | .fixup_init = timer_fixup_init, |
487 | .fixup_activate = timer_fixup_activate, | 521 | .fixup_activate = timer_fixup_activate, |
488 | .fixup_free = timer_fixup_free, | 522 | .fixup_free = timer_fixup_free, |
523 | .fixup_assert_init = timer_fixup_assert_init, | ||
489 | }; | 524 | }; |
490 | 525 | ||
491 | static inline void debug_timer_init(struct timer_list *timer) | 526 | static inline void debug_timer_init(struct timer_list *timer) |
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer) | |||
508 | debug_object_free(timer, &timer_debug_descr); | 543 | debug_object_free(timer, &timer_debug_descr); |
509 | } | 544 | } |
510 | 545 | ||
546 | static inline void debug_timer_assert_init(struct timer_list *timer) | ||
547 | { | ||
548 | debug_object_assert_init(timer, &timer_debug_descr); | ||
549 | } | ||
550 | |||
511 | static void __init_timer(struct timer_list *timer, | 551 | static void __init_timer(struct timer_list *timer, |
512 | const char *name, | 552 | const char *name, |
513 | struct lock_class_key *key); | 553 | struct lock_class_key *key); |
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); | |||
531 | static inline void debug_timer_init(struct timer_list *timer) { } | 571 | static inline void debug_timer_init(struct timer_list *timer) { } |
532 | static inline void debug_timer_activate(struct timer_list *timer) { } | 572 | static inline void debug_timer_activate(struct timer_list *timer) { } |
533 | static inline void debug_timer_deactivate(struct timer_list *timer) { } | 573 | static inline void debug_timer_deactivate(struct timer_list *timer) { } |
574 | static inline void debug_timer_assert_init(struct timer_list *timer) { } | ||
534 | #endif | 575 | #endif |
535 | 576 | ||
536 | static inline void debug_init(struct timer_list *timer) | 577 | static inline void debug_init(struct timer_list *timer) |
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer) | |||
552 | trace_timer_cancel(timer); | 593 | trace_timer_cancel(timer); |
553 | } | 594 | } |
554 | 595 | ||
596 | static inline void debug_assert_init(struct timer_list *timer) | ||
597 | { | ||
598 | debug_timer_assert_init(timer); | ||
599 | } | ||
600 | |||
555 | static void __init_timer(struct timer_list *timer, | 601 | static void __init_timer(struct timer_list *timer, |
556 | const char *name, | 602 | const char *name, |
557 | struct lock_class_key *key) | 603 | struct lock_class_key *key) |
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer) | |||
902 | unsigned long flags; | 948 | unsigned long flags; |
903 | int ret = 0; | 949 | int ret = 0; |
904 | 950 | ||
951 | debug_assert_init(timer); | ||
952 | |||
905 | timer_stats_timer_clear_start_info(timer); | 953 | timer_stats_timer_clear_start_info(timer); |
906 | if (timer_pending(timer)) { | 954 | if (timer_pending(timer)) { |
907 | base = lock_timer_base(timer, &flags); | 955 | base = lock_timer_base(timer, &flags); |
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
932 | unsigned long flags; | 980 | unsigned long flags; |
933 | int ret = -1; | 981 | int ret = -1; |
934 | 982 | ||
983 | debug_assert_init(timer); | ||
984 | |||
935 | base = lock_timer_base(timer, &flags); | 985 | base = lock_timer_base(timer, &flags); |
936 | 986 | ||
937 | if (base->running_timer == timer) | 987 | if (base->running_timer == timer) |
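
Editor's note: with the new fixup_assert_init/debug_assert_init path, deleting a timer that was never initialized is caught up front when CONFIG_DEBUG_OBJECTS_TIMERS is enabled: statically initialized timers are quietly registered with the tracker, anything else is diverted to stub_timer() with a warning. A sketch of the misuse it flags — illustrative module-style code, not from the patch:

#include <linux/timer.h>

static struct timer_list bogus_timer;	/* never run through init_timer()/setup_timer() */

static void timer_misuse_example(void)
{
	/* debug_assert_init() fires here: the object tracker has never seen
	 * bogus_timer and it is not statically initialized, so the fixup
	 * installs stub_timer() and warns instead of letting it slide. */
	del_timer(&bogus_timer);
}
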
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f..91dc4bc8bf72 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
338 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
342 | TRACE_ITER_IRQ_INFO; | ||
342 | 343 | ||
343 | static int trace_stop_count; | 344 | static int trace_stop_count; |
344 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 345 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
@@ -426,6 +427,7 @@ static const char *trace_options[] = { | |||
426 | "record-cmd", | 427 | "record-cmd", |
427 | "overwrite", | 428 | "overwrite", |
428 | "disable_on_free", | 429 | "disable_on_free", |
430 | "irq-info", | ||
429 | NULL | 431 | NULL |
430 | }; | 432 | }; |
431 | 433 | ||
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) | |||
1843 | trace_event_read_unlock(); | 1845 | trace_event_read_unlock(); |
1844 | } | 1846 | } |
1845 | 1847 | ||
1848 | static void | ||
1849 | get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) | ||
1850 | { | ||
1851 | unsigned long count; | ||
1852 | int cpu; | ||
1853 | |||
1854 | *total = 0; | ||
1855 | *entries = 0; | ||
1856 | |||
1857 | for_each_tracing_cpu(cpu) { | ||
1858 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1859 | /* | ||
1860 | * If this buffer has skipped entries, then we hold all | ||
1861 | * entries for the trace and we need to ignore the | ||
1862 | * ones before the time stamp. | ||
1863 | */ | ||
1864 | if (tr->data[cpu]->skipped_entries) { | ||
1865 | count -= tr->data[cpu]->skipped_entries; | ||
1866 | /* total is the same as the entries */ | ||
1867 | *total += count; | ||
1868 | } else | ||
1869 | *total += count + | ||
1870 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1871 | *entries += count; | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1846 | static void print_lat_help_header(struct seq_file *m) | 1875 | static void print_lat_help_header(struct seq_file *m) |
1847 | { | 1876 | { |
1848 | seq_puts(m, "# _------=> CPU# \n"); | 1877 | seq_puts(m, "# _------=> CPU# \n"); |
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m) | |||
1855 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 1884 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1856 | } | 1885 | } |
1857 | 1886 | ||
1858 | static void print_func_help_header(struct seq_file *m) | 1887 | static void print_event_info(struct trace_array *tr, struct seq_file *m) |
1888 | { | ||
1889 | unsigned long total; | ||
1890 | unsigned long entries; | ||
1891 | |||
1892 | get_total_entries(tr, &total, &entries); | ||
1893 | seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", | ||
1894 | entries, total, num_online_cpus()); | ||
1895 | seq_puts(m, "#\n"); | ||
1896 | } | ||
1897 | |||
1898 | static void print_func_help_header(struct trace_array *tr, struct seq_file *m) | ||
1859 | { | 1899 | { |
1860 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 1900 | print_event_info(tr, m); |
1901 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | ||
1861 | seq_puts(m, "# | | | | |\n"); | 1902 | seq_puts(m, "# | | | | |\n"); |
1862 | } | 1903 | } |
1863 | 1904 | ||
1905 | static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) | ||
1906 | { | ||
1907 | print_event_info(tr, m); | ||
1908 | seq_puts(m, "# _-----=> irqs-off\n"); | ||
1909 | seq_puts(m, "# / _----=> need-resched\n"); | ||
1910 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | ||
1911 | seq_puts(m, "# || / _--=> preempt-depth\n"); | ||
1912 | seq_puts(m, "# ||| / delay\n"); | ||
1913 | seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); | ||
1914 | seq_puts(m, "# | | | |||| | |\n"); | ||
1915 | } | ||
1864 | 1916 | ||
1865 | void | 1917 | void |
1866 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) | 1918 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) |
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
1869 | struct trace_array *tr = iter->tr; | 1921 | struct trace_array *tr = iter->tr; |
1870 | struct trace_array_cpu *data = tr->data[tr->cpu]; | 1922 | struct trace_array_cpu *data = tr->data[tr->cpu]; |
1871 | struct tracer *type = current_trace; | 1923 | struct tracer *type = current_trace; |
1872 | unsigned long entries = 0; | 1924 | unsigned long entries; |
1873 | unsigned long total = 0; | 1925 | unsigned long total; |
1874 | unsigned long count; | ||
1875 | const char *name = "preemption"; | 1926 | const char *name = "preemption"; |
1876 | int cpu; | ||
1877 | 1927 | ||
1878 | if (type) | 1928 | if (type) |
1879 | name = type->name; | 1929 | name = type->name; |
1880 | 1930 | ||
1881 | 1931 | get_total_entries(tr, &total, &entries); | |
1882 | for_each_tracing_cpu(cpu) { | ||
1883 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1884 | /* | ||
1885 | * If this buffer has skipped entries, then we hold all | ||
1886 | * entries for the trace and we need to ignore the | ||
1887 | * ones before the time stamp. | ||
1888 | */ | ||
1889 | if (tr->data[cpu]->skipped_entries) { | ||
1890 | count -= tr->data[cpu]->skipped_entries; | ||
1891 | /* total is the same as the entries */ | ||
1892 | total += count; | ||
1893 | } else | ||
1894 | total += count + | ||
1895 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1896 | entries += count; | ||
1897 | } | ||
1898 | 1932 | ||
1899 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", | 1933 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", |
1900 | name, UTS_RELEASE); | 1934 | name, UTS_RELEASE); |
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2140 | return print_trace_fmt(iter); | 2174 | return print_trace_fmt(iter); |
2141 | } | 2175 | } |
2142 | 2176 | ||
2177 | void trace_latency_header(struct seq_file *m) | ||
2178 | { | ||
2179 | struct trace_iterator *iter = m->private; | ||
2180 | |||
2181 | /* print nothing if the buffers are empty */ | ||
2182 | if (trace_empty(iter)) | ||
2183 | return; | ||
2184 | |||
2185 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) | ||
2186 | print_trace_header(m, iter); | ||
2187 | |||
2188 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | ||
2189 | print_lat_help_header(m); | ||
2190 | } | ||
2191 | |||
2143 | void trace_default_header(struct seq_file *m) | 2192 | void trace_default_header(struct seq_file *m) |
2144 | { | 2193 | { |
2145 | struct trace_iterator *iter = m->private; | 2194 | struct trace_iterator *iter = m->private; |
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m) | |||
2155 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2204 | if (!(trace_flags & TRACE_ITER_VERBOSE)) |
2156 | print_lat_help_header(m); | 2205 | print_lat_help_header(m); |
2157 | } else { | 2206 | } else { |
2158 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2207 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
2159 | print_func_help_header(m); | 2208 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
2209 | print_func_help_header_irq(iter->tr, m); | ||
2210 | else | ||
2211 | print_func_help_header(iter->tr, m); | ||
2212 | } | ||
2160 | } | 2213 | } |
2161 | } | 2214 | } |
2162 | 2215 | ||
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4775 | { | 4828 | { |
4776 | __ftrace_dump(true, oops_dump_mode); | 4829 | __ftrace_dump(true, oops_dump_mode); |
4777 | } | 4830 | } |
4831 | EXPORT_SYMBOL_GPL(ftrace_dump); | ||
4778 | 4832 | ||
4779 | __init static int tracer_alloc_buffers(void) | 4833 | __init static int tracer_alloc_buffers(void) |
4780 | { | 4834 | { |
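
Editor's note: TRACE_ITER_IRQ_INFO is OR-ed into the default trace_flags and registered as "irq-info" in trace_options[] above, so it can be flipped at runtime like any other ftrace option. A small user-space sketch, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	/* Write "irq-info" to enable the extra latency columns, "noirq-info"
	 * to fall back to the plain TASK-PID/CPU#/TIMESTAMP header. */
	const char *opt = (argc > 1 && !strcmp(argv[1], "off")) ?
				"noirq-info" : "irq-info";
	FILE *f = fopen("/sys/kernel/debug/tracing/trace_options", "w");

	if (!f) {
		perror("trace_options");
		return 1;
	}
	fprintf(f, "%s\n", opt);
	fclose(f);
	return 0;
}
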
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..2c2657462ac3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, | |||
370 | unsigned long ip, | 370 | unsigned long ip, |
371 | unsigned long parent_ip, | 371 | unsigned long parent_ip, |
372 | unsigned long flags, int pc); | 372 | unsigned long flags, int pc); |
373 | void trace_latency_header(struct seq_file *m); | ||
373 | void trace_default_header(struct seq_file *m); | 374 | void trace_default_header(struct seq_file *m); |
374 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 375 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
375 | int trace_empty(struct trace_iterator *iter); | 376 | int trace_empty(struct trace_iterator *iter); |
@@ -654,6 +655,7 @@ enum trace_iterator_flags { | |||
654 | TRACE_ITER_RECORD_CMD = 0x100000, | 655 | TRACE_ITER_RECORD_CMD = 0x100000, |
655 | TRACE_ITER_OVERWRITE = 0x200000, | 656 | TRACE_ITER_OVERWRITE = 0x200000, |
656 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 657 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
658 | TRACE_ITER_IRQ_INFO = 0x800000, | ||
657 | }; | 659 | }; |
658 | 660 | ||
659 | /* | 661 | /* |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 95dc31efd6dd..f04cc3136bd3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -27,6 +27,12 @@ | |||
27 | #include "trace.h" | 27 | #include "trace.h" |
28 | #include "trace_output.h" | 28 | #include "trace_output.h" |
29 | 29 | ||
30 | #define DEFAULT_SYS_FILTER_MESSAGE \ | ||
31 | "### global filter ###\n" \ | ||
32 | "# Use this to set filters for multiple events.\n" \ | ||
33 | "# Only events with the given fields will be affected.\n" \ | ||
34 | "# If no events are modified, an error message will be displayed here" | ||
35 | |||
30 | enum filter_op_ids | 36 | enum filter_op_ids |
31 | { | 37 | { |
32 | OP_OR, | 38 | OP_OR, |
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
646 | if (filter && filter->filter_string) | 652 | if (filter && filter->filter_string) |
647 | trace_seq_printf(s, "%s\n", filter->filter_string); | 653 | trace_seq_printf(s, "%s\n", filter->filter_string); |
648 | else | 654 | else |
649 | trace_seq_printf(s, "none\n"); | 655 | trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); |
650 | mutex_unlock(&event_mutex); | 656 | mutex_unlock(&event_mutex); |
651 | } | 657 | } |
652 | 658 | ||
@@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1838 | if (!filter) | 1844 | if (!filter) |
1839 | goto out; | 1845 | goto out; |
1840 | 1846 | ||
1841 | replace_filter_string(filter, filter_string); | 1847 | /* System filters just show a default message */ |
1848 | kfree(filter->filter_string); | ||
1849 | filter->filter_string = NULL; | ||
1850 | |||
1842 | /* | 1851 | /* |
1843 | * No event actually uses the system filter | 1852 | * No event actually uses the system filter |
1844 | * we can free it without synchronize_sched(). | 1853 | * we can free it without synchronize_sched(). |
@@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1848 | 1857 | ||
1849 | parse_init(ps, filter_ops, filter_string); | 1858 | parse_init(ps, filter_ops, filter_string); |
1850 | err = filter_parse(ps); | 1859 | err = filter_parse(ps); |
1851 | if (err) { | 1860 | if (err) |
1852 | append_filter_err(ps, system->filter); | 1861 | goto err_filter; |
1853 | goto out; | ||
1854 | } | ||
1855 | 1862 | ||
1856 | err = replace_system_preds(system, ps, filter_string); | 1863 | err = replace_system_preds(system, ps, filter_string); |
1857 | if (err) | 1864 | if (err) |
1858 | append_filter_err(ps, system->filter); | 1865 | goto err_filter; |
1859 | 1866 | ||
1860 | out: | 1867 | out: |
1861 | filter_opstack_clear(ps); | 1868 | filter_opstack_clear(ps); |
@@ -1865,6 +1872,11 @@ out_unlock: | |||
1865 | mutex_unlock(&event_mutex); | 1872 | mutex_unlock(&event_mutex); |
1866 | 1873 | ||
1867 | return err; | 1874 | return err; |
1875 | |||
1876 | err_filter: | ||
1877 | replace_filter_string(filter, filter_string); | ||
1878 | append_filter_err(ps, system->filter); | ||
1879 | goto out; | ||
1868 | } | 1880 | } |
1869 | 1881 | ||
1870 | #ifdef CONFIG_PERF_EVENTS | 1882 | #ifdef CONFIG_PERF_EVENTS |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 20dad0d7a163..99d20e920368 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void irqsoff_print_header(struct seq_file *s) { } | ||
284 | static void irqsoff_trace_open(struct trace_iterator *iter) { } | 283 | static void irqsoff_trace_open(struct trace_iterator *iter) { } |
285 | static void irqsoff_trace_close(struct trace_iterator *iter) { } | 284 | static void irqsoff_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void irqsoff_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void irqsoff_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) | |||
627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
628 | unsigned long secs = (unsigned long)t; | 628 | unsigned long secs = (unsigned long)t; |
629 | char comm[TASK_COMM_LEN]; | 629 | char comm[TASK_COMM_LEN]; |
630 | int ret; | ||
630 | 631 | ||
631 | trace_find_cmdline(entry->pid, comm); | 632 | trace_find_cmdline(entry->pid, comm); |
632 | 633 | ||
633 | return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", | 634 | ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", |
634 | comm, entry->pid, iter->cpu, secs, usec_rem); | 635 | comm, entry->pid, iter->cpu); |
636 | if (!ret) | ||
637 | return 0; | ||
638 | |||
639 | if (trace_flags & TRACE_ITER_IRQ_INFO) { | ||
640 | ret = trace_print_lat_fmt(s, entry); | ||
641 | if (!ret) | ||
642 | return 0; | ||
643 | } | ||
644 | |||
645 | return trace_seq_printf(s, " %5lu.%06lu: ", | ||
646 | secs, usec_rem); | ||
635 | } | 647 | } |
636 | 648 | ||
637 | int trace_print_lat_context(struct trace_iterator *iter) | 649 | int trace_print_lat_context(struct trace_iterator *iter) |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void wakeup_print_header(struct seq_file *s) { } | ||
284 | static void wakeup_trace_open(struct trace_iterator *iter) { } | 283 | static void wakeup_trace_open(struct trace_iterator *iter) { } |
285 | static void wakeup_trace_close(struct trace_iterator *iter) { } | 284 | static void wakeup_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void wakeup_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void wakeup_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5bbfac85866e..23b4d784ebdd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
127 | 127 | ||
128 | local_irq_save(flags); | 128 | local_irq_save(flags); |
129 | time = tsk->stime + tsk->utime; | 129 | time = tsk->stime + tsk->utime; |
130 | dtime = cputime_sub(time, tsk->acct_timexpd); | 130 | dtime = time - tsk->acct_timexpd; |
131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); |
132 | delta = value.tv_sec; | 132 | delta = value.tv_sec; |
133 | delta = delta * USEC_PER_SEC + value.tv_usec; | 133 | delta = delta * USEC_PER_SEC + value.tv_usec; |
diff --git a/kernel/wait.c b/kernel/wait.c index 26fa7797f90f..7fdd9eaca2c3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -10,10 +10,10 @@ | |||
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
12 | 12 | ||
13 | void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) | 13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
14 | { | 14 | { |
15 | spin_lock_init(&q->lock); | 15 | spin_lock_init(&q->lock); |
16 | lockdep_set_class(&q->lock, key); | 16 | lockdep_set_class_and_name(&q->lock, key, name); |
17 | INIT_LIST_HEAD(&q->task_list); | 17 | INIT_LIST_HEAD(&q->task_list); |
18 | } | 18 | } |
19 | 19 | ||
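
Editor's note: passing a name into __init_waitqueue_head() lets lockdep report the wait queue's own identifier instead of a generic lock class. Presumably the companion include/linux/wait.h change (outside this kernel/-only diffstat) wraps it so existing callers stay unchanged, along these lines — a sketch, not the actual header:

/* include/linux/wait.h (assumed shape of the companion change) */
#define init_waitqueue_head(q)					\
	do {							\
		static struct lock_class_key __key;		\
								\
		__init_waitqueue_head((q), #q, &__key);		\
	} while (0)
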