Diffstat (limited to 'kernel')
57 files changed, 2590 insertions, 2529 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100eaa..d2b32ac27a39 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks | |||
@@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH | |||
138 | 138 | ||
139 | config INLINE_SPIN_UNLOCK_IRQ | 139 | config INLINE_SPIN_UNLOCK_IRQ |
140 | def_bool y | 140 | def_bool y |
141 | depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH | 141 | depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ |
142 | 142 | ||
143 | config INLINE_SPIN_UNLOCK_IRQRESTORE | 143 | config INLINE_SPIN_UNLOCK_IRQRESTORE |
144 | def_bool y | 144 | def_bool y |
@@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH | |||
175 | 175 | ||
176 | config INLINE_READ_UNLOCK_IRQ | 176 | config INLINE_READ_UNLOCK_IRQ |
177 | def_bool y | 177 | def_bool y |
178 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH | 178 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ |
179 | 179 | ||
180 | config INLINE_READ_UNLOCK_IRQRESTORE | 180 | config INLINE_READ_UNLOCK_IRQRESTORE |
181 | def_bool y | 181 | def_bool y |
@@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH | |||
212 | 212 | ||
213 | config INLINE_WRITE_UNLOCK_IRQ | 213 | config INLINE_WRITE_UNLOCK_IRQ |
214 | def_bool y | 214 | def_bool y |
215 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH | 215 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ |
216 | 216 | ||
217 | config INLINE_WRITE_UNLOCK_IRQRESTORE | 217 | config INLINE_WRITE_UNLOCK_IRQRESTORE |
218 | def_bool y | 218 | def_bool y |
diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd3..91e53d04b6a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
1056 | static void wait_for_auditd(unsigned long sleep_time) | 1056 | static void wait_for_auditd(unsigned long sleep_time) |
1057 | { | 1057 | { |
1058 | DECLARE_WAITQUEUE(wait, current); | 1058 | DECLARE_WAITQUEUE(wait, current); |
1059 | set_current_state(TASK_INTERRUPTIBLE); | 1059 | set_current_state(TASK_UNINTERRUPTIBLE); |
1060 | add_wait_queue(&audit_backlog_wait, &wait); | 1060 | add_wait_queue(&audit_backlog_wait, &wait); |
1061 | 1061 | ||
1062 | if (audit_backlog_limit && | 1062 | if (audit_backlog_limit && |
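The one-line change above switches the backlog wait to TASK_UNINTERRUPTIBLE so that a pending signal no longer cuts schedule_timeout() short and turns the backoff into a busy loop. A hedged, generic sketch of this wait pattern follows; the rest of wait_for_auditd() is outside this hunk, and backlog_is_over_limit() is an illustrative stand-in for the real condition:

	DECLARE_WAITQUEUE(wait, current);

	set_current_state(TASK_UNINTERRUPTIBLE);	/* a signal can no longer cut the sleep short */
	add_wait_queue(&audit_backlog_wait, &wait);

	if (backlog_is_over_limit())			/* illustrative condition */
		schedule_timeout(sleep_time);

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&audit_backlog_wait, &wait);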
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) | |||
658 | struct vfsmount *mnt; | 658 | struct vfsmount *mnt; |
659 | int err; | 659 | int err; |
660 | 660 | ||
661 | rule->tree = NULL; | ||
661 | list_for_each_entry(tree, &tree_list, list) { | 662 | list_for_each_entry(tree, &tree_list, list) { |
662 | if (!strcmp(seed->pathname, tree->pathname)) { | 663 | if (!strcmp(seed->pathname, tree->pathname)) { |
663 | put_tree(seed); | 664 | put_tree(seed); |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..383f8231e436 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -15,7 +15,6 @@ | |||
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/context_tracking.h> | 17 | #include <linux/context_tracking.h> |
18 | #include <linux/kvm_host.h> | ||
19 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
20 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
21 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
@@ -71,6 +70,46 @@ void user_enter(void) | |||
71 | local_irq_restore(flags); | 70 | local_irq_restore(flags); |
72 | } | 71 | } |
73 | 72 | ||
73 | #ifdef CONFIG_PREEMPT | ||
74 | /** | ||
75 | * preempt_schedule_context - preempt_schedule called by tracing | ||
76 | * | ||
77 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
78 | * recursion and tracing preempt enabling caused by the tracing | ||
79 | * infrastructure itself. But as tracing can happen in areas coming | ||
80 | * from userspace or just about to enter userspace, a preempt enable | ||
81 | * can occur before user_exit() is called. This will cause the scheduler | ||
82 | * to be called when the system is still in usermode. | ||
83 | * | ||
84 | * To prevent this, the preempt_enable_notrace will use this function | ||
85 | * instead of preempt_schedule() to exit user context if needed before | ||
86 | * calling the scheduler. | ||
87 | */ | ||
88 | void __sched notrace preempt_schedule_context(void) | ||
89 | { | ||
90 | struct thread_info *ti = current_thread_info(); | ||
91 | enum ctx_state prev_ctx; | ||
92 | |||
93 | if (likely(ti->preempt_count || irqs_disabled())) | ||
94 | return; | ||
95 | |||
96 | /* | ||
97 | * Need to disable preemption in case user_exit() is traced | ||
98 | * and the tracer calls preempt_enable_notrace() causing | ||
99 | * an infinite recursion. | ||
100 | */ | ||
101 | preempt_disable_notrace(); | ||
102 | prev_ctx = exception_enter(); | ||
103 | preempt_enable_no_resched_notrace(); | ||
104 | |||
105 | preempt_schedule(); | ||
106 | |||
107 | preempt_disable_notrace(); | ||
108 | exception_exit(prev_ctx); | ||
109 | preempt_enable_notrace(); | ||
110 | } | ||
111 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
112 | #endif /* CONFIG_PREEMPT */ | ||
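For context, a hedged sketch of how the notrace preempt-enable path is expected to route into this helper. The macro below is not part of this hunk and its exact name in include/linux/preempt.h may differ; it only shows the intended wiring:

#ifdef CONFIG_CONTEXT_TRACKING
void preempt_schedule_context(void);

#define preempt_check_resched_context() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule_context(); \
} while (0)
#else
#define preempt_check_resched_context() preempt_check_resched()
#endif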
74 | 113 | ||
75 | /** | 114 | /** |
76 | * user_exit - Inform the context tracking that the CPU is | 115 | * user_exit - Inform the context tracking that the CPU is |
diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) | |||
133 | mutex_unlock(&cpu_hotplug.lock); | 133 | mutex_unlock(&cpu_hotplug.lock); |
134 | } | 134 | } |
135 | 135 | ||
136 | /* | ||
137 | * Wait for currently running CPU hotplug operations to complete (if any) and | ||
138 | * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects | ||
139 | * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the | ||
140 | * hotplug path before performing hotplug operations. So acquiring that lock | ||
141 | * guarantees mutual exclusion from any currently running hotplug operations. | ||
142 | */ | ||
143 | void cpu_hotplug_disable(void) | ||
144 | { | ||
145 | cpu_maps_update_begin(); | ||
146 | cpu_hotplug_disabled = 1; | ||
147 | cpu_maps_update_done(); | ||
148 | } | ||
149 | |||
150 | void cpu_hotplug_enable(void) | ||
151 | { | ||
152 | cpu_maps_update_begin(); | ||
153 | cpu_hotplug_disabled = 0; | ||
154 | cpu_maps_update_done(); | ||
155 | } | ||
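A hedged usage sketch of the newly exported pair: a caller that needs the set of online CPUs to stay stable across a multi-step operation. Both do_stable_cpu_walk() and probe_one_cpu() are illustrative names, not part of this patch:

static int do_stable_cpu_walk(void)
{
	int cpu, ret = 0;

	cpu_hotplug_disable();			/* waits for in-flight hotplug, then blocks new requests */
	for_each_online_cpu(cpu)
		ret |= probe_one_cpu(cpu);	/* assumed per-CPU step */
	cpu_hotplug_enable();

	return ret;
}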
156 | |||
136 | #else /* #if CONFIG_HOTPLUG_CPU */ | 157 | #else /* #if CONFIG_HOTPLUG_CPU */ |
137 | static void cpu_hotplug_begin(void) {} | 158 | static void cpu_hotplug_begin(void) {} |
138 | static void cpu_hotplug_done(void) {} | 159 | static void cpu_hotplug_done(void) {} |
@@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void) | |||
541 | core_initcall(alloc_frozen_cpus); | 562 | core_initcall(alloc_frozen_cpus); |
542 | 563 | ||
543 | /* | 564 | /* |
544 | * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU | ||
545 | * hotplug when tasks are about to be frozen. Also, don't allow the freezer | ||
546 | * to continue until any currently running CPU hotplug operation gets | ||
547 | * completed. | ||
548 | * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the | ||
549 | * 'cpu_add_remove_lock'. And this same lock is also taken by the regular | ||
550 | * CPU hotplug path and released only after it is complete. Thus, we | ||
551 | * (and hence the freezer) will block here until any currently running CPU | ||
552 | * hotplug operation gets completed. | ||
553 | */ | ||
554 | void cpu_hotplug_disable_before_freeze(void) | ||
555 | { | ||
556 | cpu_maps_update_begin(); | ||
557 | cpu_hotplug_disabled = 1; | ||
558 | cpu_maps_update_done(); | ||
559 | } | ||
560 | |||
561 | |||
562 | /* | ||
563 | * When tasks have been thawed, re-enable regular CPU hotplug (which had been | ||
564 | * disabled while beginning to freeze tasks). | ||
565 | */ | ||
566 | void cpu_hotplug_enable_after_thaw(void) | ||
567 | { | ||
568 | cpu_maps_update_begin(); | ||
569 | cpu_hotplug_disabled = 0; | ||
570 | cpu_maps_update_done(); | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * When callbacks for CPU hotplug notifications are being executed, we must | 565 | * When callbacks for CPU hotplug notifications are being executed, we must |
575 | * ensure that the state of the system with respect to the tasks being frozen | 566 | * ensure that the state of the system with respect to the tasks being frozen |
576 | * or not, as reported by the notification, remains unchanged *throughout the | 567 | * or not, as reported by the notification, remains unchanged *throughout the |
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, | |||
589 | 580 | ||
590 | case PM_SUSPEND_PREPARE: | 581 | case PM_SUSPEND_PREPARE: |
591 | case PM_HIBERNATION_PREPARE: | 582 | case PM_HIBERNATION_PREPARE: |
592 | cpu_hotplug_disable_before_freeze(); | 583 | cpu_hotplug_disable(); |
593 | break; | 584 | break; |
594 | 585 | ||
595 | case PM_POST_SUSPEND: | 586 | case PM_POST_SUSPEND: |
596 | case PM_POST_HIBERNATION: | 587 | case PM_POST_HIBERNATION: |
597 | cpu_hotplug_enable_after_thaw(); | 588 | cpu_hotplug_enable(); |
598 | break; | 589 | break; |
599 | 590 | ||
600 | default: | 591 | default: |
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
6 | #include <linux/tick.h> | 6 | #include <linux/tick.h> |
7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
8 | #include <linux/stackprotector.h> | ||
8 | 9 | ||
9 | #include <asm/tlb.h> | 10 | #include <asm/tlb.h> |
10 | 11 | ||
@@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } | |||
58 | void __weak arch_cpu_idle(void) | 59 | void __weak arch_cpu_idle(void) |
59 | { | 60 | { |
60 | cpu_idle_force_poll = 1; | 61 | cpu_idle_force_poll = 1; |
62 | local_irq_enable(); | ||
61 | } | 63 | } |
62 | 64 | ||
63 | /* | 65 | /* |
@@ -112,6 +114,21 @@ static void cpu_idle_loop(void) | |||
112 | 114 | ||
113 | void cpu_startup_entry(enum cpuhp_state state) | 115 | void cpu_startup_entry(enum cpuhp_state state) |
114 | { | 116 | { |
117 | /* | ||
118 | * This #ifdef needs to die, but it's too late in the cycle to | ||
119 | * make this generic (arm and sh have never invoked the canary | ||
120 | * init for the non boot cpus!). Will be fixed in 3.11 | ||
121 | */ | ||
122 | #ifdef CONFIG_X86 | ||
123 | /* | ||
124 | * If we're the non-boot CPU, nothing set the stack canary up | ||
125 | * for us. The boot CPU already has it initialized but no harm | ||
126 | * in doing it again. This is a good place for updating it, as | ||
127 | * we won't ever return from this function (so the invalid | ||
128 | * canaries already on the stack won't ever trigger). | ||
129 | */ | ||
130 | boot_init_stack_canary(); | ||
131 | #endif | ||
115 | current_set_polling(); | 132 | current_set_polling(); |
116 | arch_cpu_idle_prepare(); | 133 | arch_cpu_idle_prepare(); |
117 | cpu_idle_loop(); | 134 | cpu_idle_loop(); |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..902d13fc2b13 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
540 | * This function builds a partial partition of the systems CPUs | 540 | * This function builds a partial partition of the systems CPUs |
541 | * A 'partial partition' is a set of non-overlapping subsets whose | 541 | * A 'partial partition' is a set of non-overlapping subsets whose |
542 | * union is a subset of that set. | 542 | * union is a subset of that set. |
543 | * The output of this function needs to be passed to kernel/sched.c | 543 | * The output of this function needs to be passed to kernel/sched/core.c |
544 | * partition_sched_domains() routine, which will rebuild the scheduler's | 544 | * partition_sched_domains() routine, which will rebuild the scheduler's |
545 | * load balancing domains (sched domains) as specified by that partial | 545 | * load balancing domains (sched domains) as specified by that partial |
546 | * partition. | 546 | * partition. |
@@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
569 | * is a subset of one of these domains, while there are as | 569 | * is a subset of one of these domains, while there are as |
570 | * many such domains as possible, each as small as possible. | 570 | * many such domains as possible, each as small as possible. |
571 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to | 571 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to |
572 | * the kernel/sched.c routine partition_sched_domains() in a | 572 | * the kernel/sched/core.c routine partition_sched_domains() in a |
573 | * convenient format, that can be easily compared to the prior | 573 | * convenient format, that can be easily compared to the prior |
574 | * value to determine what partition elements (sched domains) | 574 | * value to determine what partition elements (sched domains) |
575 | * were changed (added or removed.) | 575 | * were changed (added or removed.) |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..1db3af933704 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' | |||
165 | /* | 165 | /* |
166 | * max perf event sample rate | 166 | * max perf event sample rate |
167 | */ | 167 | */ |
168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 | 168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
169 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | 169 | #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) |
170 | static int max_samples_per_tick __read_mostly = | 170 | #define DEFAULT_CPU_TIME_MAX_PERCENT 25 |
171 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 171 | |
172 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
173 | |||
174 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
175 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | ||
176 | |||
177 | static atomic_t perf_sample_allowed_ns __read_mostly = | ||
178 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | ||
179 | |||
180 | void update_perf_cpu_limits(void) | ||
181 | { | ||
182 | u64 tmp = perf_sample_period_ns; | ||
183 | |||
184 | tmp *= sysctl_perf_cpu_time_max_percent; | ||
185 | do_div(tmp, 100); | ||
186 | atomic_set(&perf_sample_allowed_ns, tmp); | ||
187 | } | ||
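A self-contained model of the budget computed by update_perf_cpu_limits(): with the defaults above (100000 samples/sec, 25% of CPU time) each sample is allowed roughly 2500 ns. This is a userspace sketch, not kernel code:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC		1000000000ULL
#define MAX_SAMPLE_RATE		100000ULL
#define CPU_TIME_MAX_PERCENT	25ULL

int main(void)
{
	uint64_t period_ns  = NSEC_PER_SEC / MAX_SAMPLE_RATE;		/* 10000 ns between samples */
	uint64_t allowed_ns = period_ns * CPU_TIME_MAX_PERCENT / 100;	/* 2500 ns budget per sample */

	printf("period %llu ns, allowed %llu ns\n",
	       (unsigned long long)period_ns, (unsigned long long)allowed_ns);
	return 0;
}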
188 | |||
189 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | ||
172 | 190 | ||
173 | int perf_proc_update_handler(struct ctl_table *table, int write, | 191 | int perf_proc_update_handler(struct ctl_table *table, int write, |
174 | void __user *buffer, size_t *lenp, | 192 | void __user *buffer, size_t *lenp, |
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
180 | return ret; | 198 | return ret; |
181 | 199 | ||
182 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 200 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
201 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
202 | update_perf_cpu_limits(); | ||
183 | 203 | ||
184 | return 0; | 204 | return 0; |
185 | } | 205 | } |
186 | 206 | ||
207 | int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; | ||
208 | |||
209 | int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | ||
210 | void __user *buffer, size_t *lenp, | ||
211 | loff_t *ppos) | ||
212 | { | ||
213 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
214 | |||
215 | if (ret || !write) | ||
216 | return ret; | ||
217 | |||
218 | update_perf_cpu_limits(); | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * perf samples are done in some very critical code paths (NMIs). | ||
225 | * If they take too much CPU time, the system can lock up and not | ||
226 | * get any real work done. This will drop the sample rate when | ||
227 | * we detect that events are taking too long. | ||
228 | */ | ||
229 | #define NR_ACCUMULATED_SAMPLES 128 | ||
230 | DEFINE_PER_CPU(u64, running_sample_length); | ||
231 | |||
232 | void perf_sample_event_took(u64 sample_len_ns) | ||
233 | { | ||
234 | u64 avg_local_sample_len; | ||
235 | u64 local_samples_len = __get_cpu_var(running_sample_length); | ||
236 | |||
237 | if (atomic_read(&perf_sample_allowed_ns) == 0) | ||
238 | return; | ||
239 | |||
240 | /* decay the counter by 1 average sample */ | ||
241 | local_samples_len = __get_cpu_var(running_sample_length); | ||
242 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
243 | local_samples_len += sample_len_ns; | ||
244 | __get_cpu_var(running_sample_length) = local_samples_len; | ||
245 | |||
246 | /* | ||
247 | * note: this will be biased artificially low until we have | ||
248 | * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us | ||
249 | * from having to maintain a count. | ||
250 | */ | ||
251 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
252 | |||
253 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | ||
254 | return; | ||
255 | |||
256 | if (max_samples_per_tick <= 1) | ||
257 | return; | ||
258 | |||
259 | max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); | ||
260 | sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; | ||
261 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
262 | |||
263 | printk_ratelimited(KERN_WARNING | ||
264 | "perf samples too long (%lld > %d), lowering " | ||
265 | "kernel.perf_event_max_sample_rate to %d\n", | ||
266 | avg_local_sample_len, | ||
267 | atomic_read(&perf_sample_allowed_ns), | ||
268 | sysctl_perf_event_sample_rate); | ||
269 | |||
270 | update_perf_cpu_limits(); | ||
271 | } | ||
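A minimal userspace model of the decay used above: each new sample replaces 1/NR_ACCUMULATED_SAMPLES of the running total, so the estimate converges on the true per-sample cost without keeping an explicit sample count. Sketch only; the 3 us cost is made up:

#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
	uint64_t running_len = 0, avg = 0;
	int i;

	for (i = 0; i < 1024; i++) {
		uint64_t sample_len_ns = 3000;	/* pretend every sample costs 3 us */

		running_len -= running_len / NR_ACCUMULATED_SAMPLES;
		running_len += sample_len_ns;
		avg = running_len / NR_ACCUMULATED_SAMPLES;
	}

	printf("estimated average sample length: %llu ns\n", (unsigned long long)avg);
	return 0;
}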
272 | |||
187 | static atomic64_t perf_event_id; | 273 | static atomic64_t perf_event_id; |
188 | 274 | ||
189 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 275 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
196 | static void update_context_time(struct perf_event_context *ctx); | 282 | static void update_context_time(struct perf_event_context *ctx); |
197 | static u64 perf_event_time(struct perf_event *event); | 283 | static u64 perf_event_time(struct perf_event *event); |
198 | 284 | ||
199 | static void ring_buffer_attach(struct perf_event *event, | ||
200 | struct ring_buffer *rb); | ||
201 | |||
202 | void __weak perf_event_print_debug(void) { } | 285 | void __weak perf_event_print_debug(void) { } |
203 | 286 | ||
204 | extern __weak const char *perf_pmu_name(void) | 287 | extern __weak const char *perf_pmu_name(void) |
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, | |||
658 | } | 741 | } |
659 | #endif | 742 | #endif |
660 | 743 | ||
744 | /* | ||
745 | * set default to be dependent on timer tick just | ||
746 | * like original code | ||
747 | */ | ||
748 | #define PERF_CPU_HRTIMER (1000 / HZ) | ||
749 | /* | ||
750 | * function must be called with interrupts disbled | ||
751 | */ | ||
752 | static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) | ||
753 | { | ||
754 | struct perf_cpu_context *cpuctx; | ||
755 | enum hrtimer_restart ret = HRTIMER_NORESTART; | ||
756 | int rotations = 0; | ||
757 | |||
758 | WARN_ON(!irqs_disabled()); | ||
759 | |||
760 | cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); | ||
761 | |||
762 | rotations = perf_rotate_context(cpuctx); | ||
763 | |||
764 | /* | ||
765 | * arm timer if needed | ||
766 | */ | ||
767 | if (rotations) { | ||
768 | hrtimer_forward_now(hr, cpuctx->hrtimer_interval); | ||
769 | ret = HRTIMER_RESTART; | ||
770 | } | ||
771 | |||
772 | return ret; | ||
773 | } | ||
774 | |||
775 | /* CPU is going down */ | ||
776 | void perf_cpu_hrtimer_cancel(int cpu) | ||
777 | { | ||
778 | struct perf_cpu_context *cpuctx; | ||
779 | struct pmu *pmu; | ||
780 | unsigned long flags; | ||
781 | |||
782 | if (WARN_ON(cpu != smp_processor_id())) | ||
783 | return; | ||
784 | |||
785 | local_irq_save(flags); | ||
786 | |||
787 | rcu_read_lock(); | ||
788 | |||
789 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
790 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
791 | |||
792 | if (pmu->task_ctx_nr == perf_sw_context) | ||
793 | continue; | ||
794 | |||
795 | hrtimer_cancel(&cpuctx->hrtimer); | ||
796 | } | ||
797 | |||
798 | rcu_read_unlock(); | ||
799 | |||
800 | local_irq_restore(flags); | ||
801 | } | ||
802 | |||
803 | static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) | ||
804 | { | ||
805 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
806 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
807 | int timer; | ||
808 | |||
809 | /* no multiplexing needed for SW PMU */ | ||
810 | if (pmu->task_ctx_nr == perf_sw_context) | ||
811 | return; | ||
812 | |||
813 | /* | ||
814 | * check default is sane, if not set then force to | ||
815 | * default interval (1/tick) | ||
816 | */ | ||
817 | timer = pmu->hrtimer_interval_ms; | ||
818 | if (timer < 1) | ||
819 | timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; | ||
820 | |||
821 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
822 | |||
823 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); | ||
824 | hr->function = perf_cpu_hrtimer_handler; | ||
825 | } | ||
826 | |||
827 | static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) | ||
828 | { | ||
829 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
830 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
831 | |||
832 | /* not for SW PMU */ | ||
833 | if (pmu->task_ctx_nr == perf_sw_context) | ||
834 | return; | ||
835 | |||
836 | if (hrtimer_active(hr)) | ||
837 | return; | ||
838 | |||
839 | if (!hrtimer_callback_running(hr)) | ||
840 | __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, | ||
841 | 0, HRTIMER_MODE_REL_PINNED, 0); | ||
842 | } | ||
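A hedged sketch of the driver side of this interface: a PMU can pick a non-default multiplexing interval before registration, and perf_pmu_register() then feeds it into __perf_cpu_hrtimer_init() above. The pmu object and init function below are illustrative only and omit the mandatory callbacks (.event_init, .add, .del, ...):

static struct pmu my_pmu;		/* illustrative; a real driver fills in the callbacks */

static int __init my_pmu_init(void)
{
	my_pmu.hrtimer_interval_ms = 4;	/* rotate contexts every 4 ms instead of 1/HZ */
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}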
843 | |||
661 | void perf_pmu_disable(struct pmu *pmu) | 844 | void perf_pmu_disable(struct pmu *pmu) |
662 | { | 845 | { |
663 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 846 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event, | |||
1506 | 1689 | ||
1507 | if (event_sched_in(group_event, cpuctx, ctx)) { | 1690 | if (event_sched_in(group_event, cpuctx, ctx)) { |
1508 | pmu->cancel_txn(pmu); | 1691 | pmu->cancel_txn(pmu); |
1692 | perf_cpu_hrtimer_restart(cpuctx); | ||
1509 | return -EAGAIN; | 1693 | return -EAGAIN; |
1510 | } | 1694 | } |
1511 | 1695 | ||
@@ -1552,6 +1736,8 @@ group_error: | |||
1552 | 1736 | ||
1553 | pmu->cancel_txn(pmu); | 1737 | pmu->cancel_txn(pmu); |
1554 | 1738 | ||
1739 | perf_cpu_hrtimer_restart(cpuctx); | ||
1740 | |||
1555 | return -EAGAIN; | 1741 | return -EAGAIN; |
1556 | } | 1742 | } |
1557 | 1743 | ||
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info) | |||
1807 | * If this event can't go on and it's part of a | 1993 | * If this event can't go on and it's part of a |
1808 | * group, then the whole group has to come off. | 1994 | * group, then the whole group has to come off. |
1809 | */ | 1995 | */ |
1810 | if (leader != event) | 1996 | if (leader != event) { |
1811 | group_sched_out(leader, cpuctx, ctx); | 1997 | group_sched_out(leader, cpuctx, ctx); |
1998 | perf_cpu_hrtimer_restart(cpuctx); | ||
1999 | } | ||
1812 | if (leader->attr.pinned) { | 2000 | if (leader->attr.pinned) { |
1813 | update_group_times(leader); | 2001 | update_group_times(leader); |
1814 | leader->state = PERF_EVENT_STATE_ERROR; | 2002 | leader->state = PERF_EVENT_STATE_ERROR; |
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
2555 | * because they're strictly cpu affine and rotate_start is called with IRQs | 2743 | * because they're strictly cpu affine and rotate_start is called with IRQs |
2556 | * disabled, while rotate_context is called from IRQ context. | 2744 | * disabled, while rotate_context is called from IRQ context. |
2557 | */ | 2745 | */ |
2558 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | 2746 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) |
2559 | { | 2747 | { |
2560 | struct perf_event_context *ctx = NULL; | 2748 | struct perf_event_context *ctx = NULL; |
2561 | int rotate = 0, remove = 1; | 2749 | int rotate = 0, remove = 1; |
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2594 | done: | 2782 | done: |
2595 | if (remove) | 2783 | if (remove) |
2596 | list_del_init(&cpuctx->rotation_list); | 2784 | list_del_init(&cpuctx->rotation_list); |
2785 | |||
2786 | return rotate; | ||
2597 | } | 2787 | } |
2598 | 2788 | ||
2599 | #ifdef CONFIG_NO_HZ_FULL | 2789 | #ifdef CONFIG_NO_HZ_FULL |
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void) | |||
2625 | ctx = cpuctx->task_ctx; | 2815 | ctx = cpuctx->task_ctx; |
2626 | if (ctx) | 2816 | if (ctx) |
2627 | perf_adjust_freq_unthr_context(ctx, throttled); | 2817 | perf_adjust_freq_unthr_context(ctx, throttled); |
2628 | |||
2629 | if (cpuctx->jiffies_interval == 1 || | ||
2630 | !(jiffies % cpuctx->jiffies_interval)) | ||
2631 | perf_rotate_context(cpuctx); | ||
2632 | } | 2818 | } |
2633 | } | 2819 | } |
2634 | 2820 | ||
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
2918 | } | 3104 | } |
2919 | 3105 | ||
2920 | static void ring_buffer_put(struct ring_buffer *rb); | 3106 | static void ring_buffer_put(struct ring_buffer *rb); |
3107 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | ||
2921 | 3108 | ||
2922 | static void free_event(struct perf_event *event) | 3109 | static void free_event(struct perf_event *event) |
2923 | { | 3110 | { |
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event) | |||
2942 | if (has_branch_stack(event)) { | 3129 | if (has_branch_stack(event)) { |
2943 | static_key_slow_dec_deferred(&perf_sched_events); | 3130 | static_key_slow_dec_deferred(&perf_sched_events); |
2944 | /* is system-wide event */ | 3131 | /* is system-wide event */ |
2945 | if (!(event->attach_state & PERF_ATTACH_TASK)) | 3132 | if (!(event->attach_state & PERF_ATTACH_TASK)) { |
2946 | atomic_dec(&per_cpu(perf_branch_stack_events, | 3133 | atomic_dec(&per_cpu(perf_branch_stack_events, |
2947 | event->cpu)); | 3134 | event->cpu)); |
3135 | } | ||
2948 | } | 3136 | } |
2949 | } | 3137 | } |
2950 | 3138 | ||
2951 | if (event->rb) { | 3139 | if (event->rb) { |
2952 | ring_buffer_put(event->rb); | 3140 | struct ring_buffer *rb; |
2953 | event->rb = NULL; | 3141 | |
3142 | /* | ||
3143 | * Can happen when we close an event with re-directed output. | ||
3144 | * | ||
3145 | * Since we have a 0 refcount, perf_mmap_close() will skip | ||
3146 | * over us; possibly making our ring_buffer_put() the last. | ||
3147 | */ | ||
3148 | mutex_lock(&event->mmap_mutex); | ||
3149 | rb = event->rb; | ||
3150 | if (rb) { | ||
3151 | rcu_assign_pointer(event->rb, NULL); | ||
3152 | ring_buffer_detach(event, rb); | ||
3153 | ring_buffer_put(rb); /* could be last */ | ||
3154 | } | ||
3155 | mutex_unlock(&event->mmap_mutex); | ||
2954 | } | 3156 | } |
2955 | 3157 | ||
2956 | if (is_cgroup_event(event)) | 3158 | if (is_cgroup_event(event)) |
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3188 | unsigned int events = POLL_HUP; | 3390 | unsigned int events = POLL_HUP; |
3189 | 3391 | ||
3190 | /* | 3392 | /* |
3191 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | 3393 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
3192 | * grabs the rb reference but perf_event_set_output() overrides it. | 3394 | * perf_event_set_output() can swizzle our rb and make us miss wakeups. |
3193 | * Here is the timeline for two threads T1, T2: | ||
3194 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3195 | * t1: T2, old_rb = event->rb | ||
3196 | * t2: T2, event->rb = new rb | ||
3197 | * t3: T2, ring_buffer_detach(old_rb) | ||
3198 | * t4: T1, ring_buffer_attach(rb1) | ||
3199 | * t5: T1, poll_wait(event->waitq) | ||
3200 | * | ||
3201 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3202 | * thereby ensuring that the assignment of the new ring buffer | ||
3203 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3204 | */ | 3395 | */ |
3205 | mutex_lock(&event->mmap_mutex); | 3396 | mutex_lock(&event->mmap_mutex); |
3206 | 3397 | rb = event->rb; | |
3207 | rcu_read_lock(); | 3398 | if (rb) |
3208 | rb = rcu_dereference(event->rb); | ||
3209 | if (rb) { | ||
3210 | ring_buffer_attach(event, rb); | ||
3211 | events = atomic_xchg(&rb->poll, 0); | 3399 | events = atomic_xchg(&rb->poll, 0); |
3212 | } | ||
3213 | rcu_read_unlock(); | ||
3214 | |||
3215 | mutex_unlock(&event->mmap_mutex); | 3400 | mutex_unlock(&event->mmap_mutex); |
3216 | 3401 | ||
3217 | poll_wait(file, &event->waitq, wait); | 3402 | poll_wait(file, &event->waitq, wait); |
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event, | |||
3521 | return; | 3706 | return; |
3522 | 3707 | ||
3523 | spin_lock_irqsave(&rb->event_lock, flags); | 3708 | spin_lock_irqsave(&rb->event_lock, flags); |
3524 | if (!list_empty(&event->rb_entry)) | 3709 | if (list_empty(&event->rb_entry)) |
3525 | goto unlock; | 3710 | list_add(&event->rb_entry, &rb->event_list); |
3526 | |||
3527 | list_add(&event->rb_entry, &rb->event_list); | ||
3528 | unlock: | ||
3529 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3711 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3530 | } | 3712 | } |
3531 | 3713 | ||
3532 | static void ring_buffer_detach(struct perf_event *event, | 3714 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) |
3533 | struct ring_buffer *rb) | ||
3534 | { | 3715 | { |
3535 | unsigned long flags; | 3716 | unsigned long flags; |
3536 | 3717 | ||
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event) | |||
3549 | 3730 | ||
3550 | rcu_read_lock(); | 3731 | rcu_read_lock(); |
3551 | rb = rcu_dereference(event->rb); | 3732 | rb = rcu_dereference(event->rb); |
3552 | if (!rb) | 3733 | if (rb) { |
3553 | goto unlock; | 3734 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) |
3554 | 3735 | wake_up_all(&event->waitq); | |
3555 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | 3736 | } |
3556 | wake_up_all(&event->waitq); | ||
3557 | |||
3558 | unlock: | ||
3559 | rcu_read_unlock(); | 3737 | rcu_read_unlock(); |
3560 | } | 3738 | } |
3561 | 3739 | ||
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3584 | 3762 | ||
3585 | static void ring_buffer_put(struct ring_buffer *rb) | 3763 | static void ring_buffer_put(struct ring_buffer *rb) |
3586 | { | 3764 | { |
3587 | struct perf_event *event, *n; | ||
3588 | unsigned long flags; | ||
3589 | |||
3590 | if (!atomic_dec_and_test(&rb->refcount)) | 3765 | if (!atomic_dec_and_test(&rb->refcount)) |
3591 | return; | 3766 | return; |
3592 | 3767 | ||
3593 | spin_lock_irqsave(&rb->event_lock, flags); | 3768 | WARN_ON_ONCE(!list_empty(&rb->event_list)); |
3594 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3595 | list_del_init(&event->rb_entry); | ||
3596 | wake_up_all(&event->waitq); | ||
3597 | } | ||
3598 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3599 | 3769 | ||
3600 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3770 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3601 | } | 3771 | } |
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
3605 | struct perf_event *event = vma->vm_file->private_data; | 3775 | struct perf_event *event = vma->vm_file->private_data; |
3606 | 3776 | ||
3607 | atomic_inc(&event->mmap_count); | 3777 | atomic_inc(&event->mmap_count); |
3778 | atomic_inc(&event->rb->mmap_count); | ||
3608 | } | 3779 | } |
3609 | 3780 | ||
3781 | /* | ||
3782 | * A buffer can be mmap()ed multiple times; either directly through the same | ||
3783 | * event, or through other events by use of perf_event_set_output(). | ||
3784 | * | ||
3785 | * In order to undo the VM accounting done by perf_mmap() we need to destroy | ||
3786 | * the buffer here, where we still have a VM context. This means we need | ||
3787 | * to detach all events redirecting to us. | ||
3788 | */ | ||
3610 | static void perf_mmap_close(struct vm_area_struct *vma) | 3789 | static void perf_mmap_close(struct vm_area_struct *vma) |
3611 | { | 3790 | { |
3612 | struct perf_event *event = vma->vm_file->private_data; | 3791 | struct perf_event *event = vma->vm_file->private_data; |
3613 | 3792 | ||
3614 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3793 | struct ring_buffer *rb = event->rb; |
3615 | unsigned long size = perf_data_size(event->rb); | 3794 | struct user_struct *mmap_user = rb->mmap_user; |
3616 | struct user_struct *user = event->mmap_user; | 3795 | int mmap_locked = rb->mmap_locked; |
3617 | struct ring_buffer *rb = event->rb; | 3796 | unsigned long size = perf_data_size(rb); |
3797 | |||
3798 | atomic_dec(&rb->mmap_count); | ||
3799 | |||
3800 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | ||
3801 | return; | ||
3618 | 3802 | ||
3619 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3803 | /* Detach current event from the buffer. */ |
3620 | vma->vm_mm->pinned_vm -= event->mmap_locked; | 3804 | rcu_assign_pointer(event->rb, NULL); |
3621 | rcu_assign_pointer(event->rb, NULL); | 3805 | ring_buffer_detach(event, rb); |
3622 | ring_buffer_detach(event, rb); | 3806 | mutex_unlock(&event->mmap_mutex); |
3807 | |||
3808 | /* If there's still other mmap()s of this buffer, we're done. */ | ||
3809 | if (atomic_read(&rb->mmap_count)) { | ||
3810 | ring_buffer_put(rb); /* can't be last */ | ||
3811 | return; | ||
3812 | } | ||
3813 | |||
3814 | /* | ||
3815 | * No other mmap()s, detach from all other events that might redirect | ||
3816 | * into the now unreachable buffer. Somewhat complicated by the | ||
3817 | * fact that rb::event_lock otherwise nests inside mmap_mutex. | ||
3818 | */ | ||
3819 | again: | ||
3820 | rcu_read_lock(); | ||
3821 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { | ||
3822 | if (!atomic_long_inc_not_zero(&event->refcount)) { | ||
3823 | /* | ||
3824 | * This event is en-route to free_event() which will | ||
3825 | * detach it and remove it from the list. | ||
3826 | */ | ||
3827 | continue; | ||
3828 | } | ||
3829 | rcu_read_unlock(); | ||
3830 | |||
3831 | mutex_lock(&event->mmap_mutex); | ||
3832 | /* | ||
3833 | * Check we didn't race with perf_event_set_output() which can | ||
3834 | * swizzle the rb from under us while we were waiting to | ||
3835 | * acquire mmap_mutex. | ||
3836 | * | ||
3837 | * If we find a different rb; ignore this event, a next | ||
3838 | * iteration will no longer find it on the list. We have to | ||
3839 | * still restart the iteration to make sure we're not now | ||
3840 | * iterating the wrong list. | ||
3841 | */ | ||
3842 | if (event->rb == rb) { | ||
3843 | rcu_assign_pointer(event->rb, NULL); | ||
3844 | ring_buffer_detach(event, rb); | ||
3845 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3846 | } | ||
3623 | mutex_unlock(&event->mmap_mutex); | 3847 | mutex_unlock(&event->mmap_mutex); |
3848 | put_event(event); | ||
3624 | 3849 | ||
3625 | ring_buffer_put(rb); | 3850 | /* |
3626 | free_uid(user); | 3851 | * Restart the iteration; either we're on the wrong list or |
3852 | * destroyed its integrity by doing a deletion. | ||
3853 | */ | ||
3854 | goto again; | ||
3627 | } | 3855 | } |
3856 | rcu_read_unlock(); | ||
3857 | |||
3858 | /* | ||
3859 | * It could be there's still a few 0-ref events on the list; they'll | ||
3860 | * get cleaned up by free_event() -- they'll also still have their | ||
3861 | * ref on the rb and will free it whenever they are done with it. | ||
3862 | * | ||
3863 | * Aside from that, this buffer is 'fully' detached and unmapped, | ||
3864 | * undo the VM accounting. | ||
3865 | */ | ||
3866 | |||
3867 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); | ||
3868 | vma->vm_mm->pinned_vm -= mmap_locked; | ||
3869 | free_uid(mmap_user); | ||
3870 | |||
3871 | ring_buffer_put(rb); /* could be last */ | ||
3628 | } | 3872 | } |
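The atomic_long_inc_not_zero() dance above only takes a reference on events that are not already on their way to free_event(). A self-contained userspace model of that primitive, using C11 atomics (a sketch, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the count is still non-zero. */
static bool ref_get_unless_zero(atomic_long *ref)
{
	long old = atomic_load(ref);

	do {
		if (old == 0)
			return false;	/* object is being torn down; leave it alone */
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));

	return true;
}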
3629 | 3873 | ||
3630 | static const struct vm_operations_struct perf_mmap_vmops = { | 3874 | static const struct vm_operations_struct perf_mmap_vmops = { |
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3674 | return -EINVAL; | 3918 | return -EINVAL; |
3675 | 3919 | ||
3676 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3920 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3921 | again: | ||
3677 | mutex_lock(&event->mmap_mutex); | 3922 | mutex_lock(&event->mmap_mutex); |
3678 | if (event->rb) { | 3923 | if (event->rb) { |
3679 | if (event->rb->nr_pages == nr_pages) | 3924 | if (event->rb->nr_pages != nr_pages) { |
3680 | atomic_inc(&event->rb->refcount); | ||
3681 | else | ||
3682 | ret = -EINVAL; | 3925 | ret = -EINVAL; |
3926 | goto unlock; | ||
3927 | } | ||
3928 | |||
3929 | if (!atomic_inc_not_zero(&event->rb->mmap_count)) { | ||
3930 | /* | ||
3931 | * Raced against perf_mmap_close() through | ||
3932 | * perf_event_set_output(). Try again, hope for better | ||
3933 | * luck. | ||
3934 | */ | ||
3935 | mutex_unlock(&event->mmap_mutex); | ||
3936 | goto again; | ||
3937 | } | ||
3938 | |||
3683 | goto unlock; | 3939 | goto unlock; |
3684 | } | 3940 | } |
3685 | 3941 | ||
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3720 | ret = -ENOMEM; | 3976 | ret = -ENOMEM; |
3721 | goto unlock; | 3977 | goto unlock; |
3722 | } | 3978 | } |
3723 | rcu_assign_pointer(event->rb, rb); | 3979 | |
3980 | atomic_set(&rb->mmap_count, 1); | ||
3981 | rb->mmap_locked = extra; | ||
3982 | rb->mmap_user = get_current_user(); | ||
3724 | 3983 | ||
3725 | atomic_long_add(user_extra, &user->locked_vm); | 3984 | atomic_long_add(user_extra, &user->locked_vm); |
3726 | event->mmap_locked = extra; | 3985 | vma->vm_mm->pinned_vm += extra; |
3727 | event->mmap_user = get_current_user(); | 3986 | |
3728 | vma->vm_mm->pinned_vm += event->mmap_locked; | 3987 | ring_buffer_attach(event, rb); |
3988 | rcu_assign_pointer(event->rb, rb); | ||
3729 | 3989 | ||
3730 | perf_event_update_userpage(event); | 3990 | perf_event_update_userpage(event); |
3731 | 3991 | ||
@@ -3734,7 +3994,11 @@ unlock: | |||
3734 | atomic_inc(&event->mmap_count); | 3994 | atomic_inc(&event->mmap_count); |
3735 | mutex_unlock(&event->mmap_mutex); | 3995 | mutex_unlock(&event->mmap_mutex); |
3736 | 3996 | ||
3737 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; | 3997 | /* |
3998 | * Since pinned accounting is per vm we cannot allow fork() to copy our | ||
3999 | * vma. | ||
4000 | */ | ||
4001 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; | ||
3738 | vma->vm_ops = &perf_mmap_vmops; | 4002 | vma->vm_ops = &perf_mmap_vmops; |
3739 | 4003 | ||
3740 | return ret; | 4004 | return ret; |
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | |||
4961 | * sign as trigger. | 5225 | * sign as trigger. |
4962 | */ | 5226 | */ |
4963 | 5227 | ||
4964 | static u64 perf_swevent_set_period(struct perf_event *event) | 5228 | u64 perf_swevent_set_period(struct perf_event *event) |
4965 | { | 5229 | { |
4966 | struct hw_perf_event *hwc = &event->hw; | 5230 | struct hw_perf_event *hwc = &event->hw; |
4967 | u64 period = hwc->last_period; | 5231 | u64 period = hwc->last_period; |
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
5904 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6168 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
5905 | } | 6169 | } |
5906 | 6170 | ||
6171 | static ssize_t | ||
6172 | perf_event_mux_interval_ms_show(struct device *dev, | ||
6173 | struct device_attribute *attr, | ||
6174 | char *page) | ||
6175 | { | ||
6176 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6177 | |||
6178 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); | ||
6179 | } | ||
6180 | |||
6181 | static ssize_t | ||
6182 | perf_event_mux_interval_ms_store(struct device *dev, | ||
6183 | struct device_attribute *attr, | ||
6184 | const char *buf, size_t count) | ||
6185 | { | ||
6186 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6187 | int timer, cpu, ret; | ||
6188 | |||
6189 | ret = kstrtoint(buf, 0, &timer); | ||
6190 | if (ret) | ||
6191 | return ret; | ||
6192 | |||
6193 | if (timer < 1) | ||
6194 | return -EINVAL; | ||
6195 | |||
6196 | /* same value, nothing to do */ | ||
6197 | if (timer == pmu->hrtimer_interval_ms) | ||
6198 | return count; | ||
6199 | |||
6200 | pmu->hrtimer_interval_ms = timer; | ||
6201 | |||
6202 | /* update all cpuctx for this PMU */ | ||
6203 | for_each_possible_cpu(cpu) { | ||
6204 | struct perf_cpu_context *cpuctx; | ||
6205 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
6206 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
6207 | |||
6208 | if (hrtimer_active(&cpuctx->hrtimer)) | ||
6209 | hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); | ||
6210 | } | ||
6211 | |||
6212 | return count; | ||
6213 | } | ||
6214 | |||
6215 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | ||
6216 | |||
5907 | static struct device_attribute pmu_dev_attrs[] = { | 6217 | static struct device_attribute pmu_dev_attrs[] = { |
5908 | __ATTR_RO(type), | 6218 | __ATTR_RO(type), |
5909 | __ATTR_NULL, | 6219 | __ATTR_RW(perf_event_mux_interval_ms), |
6220 | __ATTR_NULL, | ||
5910 | }; | 6221 | }; |
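A hedged usage sketch for the new attribute from user space. The path assumes the generic "cpu" PMU under the event_source bus and root privileges; adjust for other PMUs:

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "4\n");	/* ask for a 4 ms multiplexing interval */
	return fclose(f) ? 1 : 0;
}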
5911 | 6222 | ||
5912 | static int pmu_bus_running; | 6223 | static int pmu_bus_running; |
@@ -5952,7 +6263,7 @@ free_dev: | |||
5952 | static struct lock_class_key cpuctx_mutex; | 6263 | static struct lock_class_key cpuctx_mutex; |
5953 | static struct lock_class_key cpuctx_lock; | 6264 | static struct lock_class_key cpuctx_lock; |
5954 | 6265 | ||
5955 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 6266 | int perf_pmu_register(struct pmu *pmu, const char *name, int type) |
5956 | { | 6267 | { |
5957 | int cpu, ret; | 6268 | int cpu, ret; |
5958 | 6269 | ||
@@ -6001,7 +6312,9 @@ skip_type: | |||
6001 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 6312 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
6002 | cpuctx->ctx.type = cpu_context; | 6313 | cpuctx->ctx.type = cpu_context; |
6003 | cpuctx->ctx.pmu = pmu; | 6314 | cpuctx->ctx.pmu = pmu; |
6004 | cpuctx->jiffies_interval = 1; | 6315 | |
6316 | __perf_cpu_hrtimer_init(cpuctx, cpu); | ||
6317 | |||
6005 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 6318 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
6006 | cpuctx->unique_pmu = pmu; | 6319 | cpuctx->unique_pmu = pmu; |
6007 | } | 6320 | } |
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6327 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) | 6640 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) |
6328 | return -EINVAL; | 6641 | return -EINVAL; |
6329 | 6642 | ||
6330 | /* kernel level capture: check permissions */ | ||
6331 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6332 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6333 | return -EACCES; | ||
6334 | |||
6335 | /* propagate priv level, when not set for branch */ | 6643 | /* propagate priv level, when not set for branch */ |
6336 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { | 6644 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { |
6337 | 6645 | ||
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6349 | */ | 6657 | */ |
6350 | attr->branch_sample_type = mask; | 6658 | attr->branch_sample_type = mask; |
6351 | } | 6659 | } |
6660 | /* privileged levels capture (kernel, hv): check permissions */ | ||
6661 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6662 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6663 | return -EACCES; | ||
6352 | } | 6664 | } |
6353 | 6665 | ||
6354 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { | 6666 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { |
@@ -6412,6 +6724,8 @@ set: | |||
6412 | if (atomic_read(&event->mmap_count)) | 6724 | if (atomic_read(&event->mmap_count)) |
6413 | goto unlock; | 6725 | goto unlock; |
6414 | 6726 | ||
6727 | old_rb = event->rb; | ||
6728 | |||
6415 | if (output_event) { | 6729 | if (output_event) { |
6416 | /* get the rb we want to redirect to */ | 6730 | /* get the rb we want to redirect to */ |
6417 | rb = ring_buffer_get(output_event); | 6731 | rb = ring_buffer_get(output_event); |
@@ -6419,16 +6733,28 @@ set: | |||
6419 | goto unlock; | 6733 | goto unlock; |
6420 | } | 6734 | } |
6421 | 6735 | ||
6422 | old_rb = event->rb; | ||
6423 | rcu_assign_pointer(event->rb, rb); | ||
6424 | if (old_rb) | 6736 | if (old_rb) |
6425 | ring_buffer_detach(event, old_rb); | 6737 | ring_buffer_detach(event, old_rb); |
6738 | |||
6739 | if (rb) | ||
6740 | ring_buffer_attach(event, rb); | ||
6741 | |||
6742 | rcu_assign_pointer(event->rb, rb); | ||
6743 | |||
6744 | if (old_rb) { | ||
6745 | ring_buffer_put(old_rb); | ||
6746 | /* | ||
6747 | * Since we detached before setting the new rb, so that we | ||
6748 | * could attach the new rb, we could have missed a wakeup. | ||
6749 | * Provide it now. | ||
6750 | */ | ||
6751 | wake_up_all(&event->waitq); | ||
6752 | } | ||
6753 | |||
6426 | ret = 0; | 6754 | ret = 0; |
6427 | unlock: | 6755 | unlock: |
6428 | mutex_unlock(&event->mmap_mutex); | 6756 | mutex_unlock(&event->mmap_mutex); |
6429 | 6757 | ||
6430 | if (old_rb) | ||
6431 | ring_buffer_put(old_rb); | ||
6432 | out: | 6758 | out: |
6433 | return ret; | 6759 | return ret; |
6434 | } | 6760 | } |
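For reference, the user-visible path that exercises perf_event_set_output() is the PERF_EVENT_IOC_SET_OUTPUT ioctl. A minimal, hedged sketch (error handling trimmed, attribute setup purely illustrative, needs sufficient perf privileges):

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int perf_open(struct perf_event_attr *attr, int cpu)
{
	return syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}

int main(void)
{
	struct perf_event_attr attr;
	int a, b;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	a = perf_open(&attr, 0);
	b = perf_open(&attr, 0);

	/* Redirect b's output into a's ring buffer; only a needs to be mmap()ed. */
	ioctl(b, PERF_EVENT_IOC_SET_OUTPUT, a);

	close(a);
	close(b);
	return 0;
}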
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
7387 | case CPU_DOWN_PREPARE: | 7713 | case CPU_DOWN_PREPARE: |
7388 | perf_event_exit_cpu(cpu); | 7714 | perf_event_exit_cpu(cpu); |
7389 | break; | 7715 | break; |
7390 | |||
7391 | default: | 7716 | default: |
7392 | break; | 7717 | break; |
7393 | } | 7718 | } |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..1559fb0b9296 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -46,23 +46,26 @@ | |||
46 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
47 | 47 | ||
48 | #include <linux/hw_breakpoint.h> | 48 | #include <linux/hw_breakpoint.h> |
49 | |||
50 | |||
51 | /* | 49 | /* |
52 | * Constraints data | 50 | * Constraints data |
53 | */ | 51 | */ |
52 | struct bp_cpuinfo { | ||
53 | /* Number of pinned cpu breakpoints in a cpu */ | ||
54 | unsigned int cpu_pinned; | ||
55 | /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ | ||
56 | unsigned int *tsk_pinned; | ||
57 | /* Number of non-pinned cpu/task breakpoints in a cpu */ | ||
58 | unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ | ||
59 | }; | ||
54 | 60 | ||
55 | /* Number of pinned cpu breakpoints in a cpu */ | 61 | static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); |
56 | static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); | ||
57 | |||
58 | /* Number of pinned task breakpoints in a cpu */ | ||
59 | static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); | ||
60 | |||
61 | /* Number of non-pinned cpu/task breakpoints in a cpu */ | ||
62 | static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); | ||
63 | |||
64 | static int nr_slots[TYPE_MAX]; | 62 | static int nr_slots[TYPE_MAX]; |
65 | 63 | ||
64 | static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) | ||
65 | { | ||
66 | return per_cpu_ptr(bp_cpuinfo + type, cpu); | ||
67 | } | ||
68 | |||
66 | /* Keep track of the breakpoints attached to tasks */ | 69 | /* Keep track of the breakpoints attached to tasks */ |
67 | static LIST_HEAD(bp_task_head); | 70 | static LIST_HEAD(bp_task_head); |
68 | 71 | ||
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) | |||
96 | */ | 99 | */ |
97 | static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | 100 | static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) |
98 | { | 101 | { |
102 | unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; | ||
99 | int i; | 103 | int i; |
100 | unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); | ||
101 | 104 | ||
102 | for (i = nr_slots[type] - 1; i >= 0; i--) { | 105 | for (i = nr_slots[type] - 1; i >= 0; i--) { |
103 | if (tsk_pinned[i] > 0) | 106 | if (tsk_pinned[i] > 0) |
@@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | |||
120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
121 | if (iter->hw.bp_target == tsk && | 124 | if (iter->hw.bp_target == tsk && |
122 | find_slot_idx(iter) == type && | 125 | find_slot_idx(iter) == type && |
123 | cpu == iter->cpu) | 126 | (iter->cpu < 0 || cpu == iter->cpu)) |
124 | count += hw_breakpoint_weight(iter); | 127 | count += hw_breakpoint_weight(iter); |
125 | } | 128 | } |
126 | 129 | ||
127 | return count; | 130 | return count; |
128 | } | 131 | } |
129 | 132 | ||
133 | static const struct cpumask *cpumask_of_bp(struct perf_event *bp) | ||
134 | { | ||
135 | if (bp->cpu >= 0) | ||
136 | return cpumask_of(bp->cpu); | ||
137 | return cpu_possible_mask; | ||
138 | } | ||
139 | |||
130 | /* | 140 | /* |
131 | * Report the number of pinned/un-pinned breakpoints we have in | 141 | * Report the number of pinned/un-pinned breakpoints we have in |
132 | * a given cpu (cpu > -1) or in all of them (cpu = -1). | 142 | * a given cpu (cpu > -1) or in all of them (cpu = -1). |
@@ -135,25 +145,15 @@ static void | |||
135 | fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | 145 | fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, |
136 | enum bp_type_idx type) | 146 | enum bp_type_idx type) |
137 | { | 147 | { |
138 | int cpu = bp->cpu; | 148 | const struct cpumask *cpumask = cpumask_of_bp(bp); |
139 | struct task_struct *tsk = bp->hw.bp_target; | 149 | int cpu; |
140 | |||
141 | if (cpu >= 0) { | ||
142 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); | ||
143 | if (!tsk) | ||
144 | slots->pinned += max_task_bp_pinned(cpu, type); | ||
145 | else | ||
146 | slots->pinned += task_bp_pinned(cpu, bp, type); | ||
147 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | ||
148 | |||
149 | return; | ||
150 | } | ||
151 | 150 | ||
152 | for_each_online_cpu(cpu) { | 151 | for_each_cpu(cpu, cpumask) { |
153 | unsigned int nr; | 152 | struct bp_cpuinfo *info = get_bp_info(cpu, type); |
153 | int nr; | ||
154 | 154 | ||
155 | nr = per_cpu(nr_cpu_bp_pinned[type], cpu); | 155 | nr = info->cpu_pinned; |
156 | if (!tsk) | 156 | if (!bp->hw.bp_target) |
157 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); |
158 | else | 158 | else |
159 | nr += task_bp_pinned(cpu, bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); |
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
161 | if (nr > slots->pinned) | 161 | if (nr > slots->pinned) |
162 | slots->pinned = nr; | 162 | slots->pinned = nr; |
163 | 163 | ||
164 | nr = per_cpu(nr_bp_flexible[type], cpu); | 164 | nr = info->flexible; |
165 | |||
166 | if (nr > slots->flexible) | 165 | if (nr > slots->flexible) |
167 | slots->flexible = nr; | 166 | slots->flexible = nr; |
168 | } | 167 | } |
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) | |||
182 | /* | 181 | /* |
183 | * Add a pinned breakpoint for the given task in our constraint table | 182 | * Add a pinned breakpoint for the given task in our constraint table |
184 | */ | 183 | */ |
185 | static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, | 184 | static void toggle_bp_task_slot(struct perf_event *bp, int cpu, |
186 | enum bp_type_idx type, int weight) | 185 | enum bp_type_idx type, int weight) |
187 | { | 186 | { |
188 | unsigned int *tsk_pinned; | 187 | unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; |
189 | int old_count = 0; | 188 | int old_idx, new_idx; |
190 | int old_idx = 0; | 189 | |
191 | int idx = 0; | 190 | old_idx = task_bp_pinned(cpu, bp, type) - 1; |
192 | 191 | new_idx = old_idx + weight; | |
193 | old_count = task_bp_pinned(cpu, bp, type); | 192 | |
194 | old_idx = old_count - 1; | 193 | if (old_idx >= 0) |
195 | idx = old_idx + weight; | 194 | tsk_pinned[old_idx]--; |
196 | 195 | if (new_idx >= 0) | |
197 | /* tsk_pinned[n] is the number of tasks having n breakpoints */ | 196 | tsk_pinned[new_idx]++; |
198 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); | ||
199 | if (enable) { | ||
200 | tsk_pinned[idx]++; | ||
201 | if (old_count > 0) | ||
202 | tsk_pinned[old_idx]--; | ||
203 | } else { | ||
204 | tsk_pinned[idx]--; | ||
205 | if (old_count > 0) | ||
206 | tsk_pinned[old_idx]++; | ||
207 | } | ||
208 | } | 197 | } |
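The rewritten helper treats tsk_pinned[] as a histogram: tsk_pinned[n] counts tasks that currently own n+1 breakpoints, and a single call moves one task between two buckets (the weight arrives already negated on release, see toggle_bp_slot() below). A self-contained userspace model; move_task() and its pinned_before argument are illustrative names:

#include <assert.h>

#define NR_SLOTS 4

static unsigned int tsk_pinned[NR_SLOTS];

static void move_task(int pinned_before, int weight)
{
	int old_idx = pinned_before - 1;
	int new_idx = old_idx + weight;

	if (old_idx >= 0)
		tsk_pinned[old_idx]--;
	if (new_idx >= 0)
		tsk_pinned[new_idx]++;
}

int main(void)
{
	move_task(0, 1);	/* task installs its first breakpoint */
	move_task(1, 1);	/* ...and a second one */
	assert(tsk_pinned[0] == 0 && tsk_pinned[1] == 1);

	move_task(2, -1);	/* one released: weight arrives negated on disable */
	move_task(1, -1);	/* ...and the other */
	assert(tsk_pinned[0] == 0 && tsk_pinned[1] == 0);
	return 0;
}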
209 | 198 | ||
210 | /* | 199 | /* |
@@ -214,33 +203,26 @@ static void | |||
214 | toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | 203 | toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, |
215 | int weight) | 204 | int weight) |
216 | { | 205 | { |
217 | int cpu = bp->cpu; | 206 | const struct cpumask *cpumask = cpumask_of_bp(bp); |
218 | struct task_struct *tsk = bp->hw.bp_target; | 207 | int cpu; |
219 | 208 | ||
220 | /* Pinned counter cpu profiling */ | 209 | if (!enable) |
221 | if (!tsk) { | 210 | weight = -weight; |
222 | 211 | ||
223 | if (enable) | 212 | /* Pinned counter cpu profiling */ |
224 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | 213 | if (!bp->hw.bp_target) { |
225 | else | 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; |
226 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | ||
227 | return; | 215 | return; |
228 | } | 216 | } |
229 | 217 | ||
230 | /* Pinned counter task profiling */ | 218 | /* Pinned counter task profiling */ |
231 | 219 | for_each_cpu(cpu, cpumask) | |
232 | if (!enable) | 220 | toggle_bp_task_slot(bp, cpu, type, weight); |
233 | list_del(&bp->hw.bp_list); | ||
234 | |||
235 | if (cpu >= 0) { | ||
236 | toggle_bp_task_slot(bp, cpu, enable, type, weight); | ||
237 | } else { | ||
238 | for_each_online_cpu(cpu) | ||
239 | toggle_bp_task_slot(bp, cpu, enable, type, weight); | ||
240 | } | ||
241 | 221 | ||
242 | if (enable) | 222 | if (enable) |
243 | list_add_tail(&bp->hw.bp_list, &bp_task_head); | 223 | list_add_tail(&bp->hw.bp_list, &bp_task_head); |
224 | else | ||
225 | list_del(&bp->hw.bp_list); | ||
244 | } | 226 | } |
245 | 227 | ||
246 | /* | 228 | /* |
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | |||
261 | * | 243 | * |
262 | * - If attached to a single cpu, check: | 244 | * - If attached to a single cpu, check: |
263 | * | 245 | * |
264 | * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) | 246 | * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) |
265 | * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM | 247 | * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM |
266 | * | 248 | * |
267 | * -> If there are already non-pinned counters in this cpu, it means | 249 | * -> If there are already non-pinned counters in this cpu, it means |
268 | * there is already a free slot for them. | 250 | * there is already a free slot for them. |
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | |||
272 | * | 254 | * |
273 | * - If attached to every cpus, check: | 255 | * - If attached to every cpus, check: |
274 | * | 256 | * |
275 | * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) | 257 | * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) |
276 | * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM | 258 | * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM |
277 | * | 259 | * |
278 | * -> This is roughly the same, except we check the number of per cpu | 260 | * -> This is roughly the same, except we check the number of per cpu |
279 | * bp for every cpu and we keep the max one. Same for the per tasks | 261 | * bp for every cpu and we keep the max one. Same for the per tasks |
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | |||
284 | * | 266 | * |
285 | * - If attached to a single cpu, check: | 267 | * - If attached to a single cpu, check: |
286 | * | 268 | * |
287 | * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) | 269 | * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) |
288 | * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM | 270 | * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM |
289 | * | 271 | * |
290 | * -> Same checks as before. But now the nr_bp_flexible, if any, must keep | 272 | * -> Same checks as before. But now the info->flexible, if any, must keep |
291 | * one register at least (or they will never be fed). | 273 | * one register at least (or they will never be fed). |
292 | * | 274 | * |
293 | * - If attached to every cpu, check: | 275 | * - If attached to every cpu, check: |
294 | * | 276 | * |
295 | * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) | 277 | * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) |
296 | * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM | 278 | * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM |
297 | */ | 279 | */ |
298 | static int __reserve_bp_slot(struct perf_event *bp) | 280 | static int __reserve_bp_slot(struct perf_event *bp) |
299 | { | 281 | { |
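To make the arithmetic in the comment block above concrete, here is a minimal sketch of the per-cpu feasibility check it describes, written against the helpers this patch introduces (get_bp_info(), nr_slots[]) and the pre-existing max_task_bp_pinned(); it is an illustration of the constraint, not the patch's own __reserve_bp_slot():

    /* Sketch: would one more pinned breakpoint of @type fit on @cpu? */
    static bool bp_slot_would_fit(int cpu, enum bp_type_idx type)
    {
            struct bp_cpuinfo *info = get_bp_info(cpu, type);
            /* pinned load: per-cpu pinned bps + worst case of per-task pinned bps */
            unsigned int pinned = info->cpu_pinned + max_task_bp_pinned(cpu, type);

            /* flexible events must always be able to grab at least one slot */
            return pinned + (info->flexible ? 1 : 0) < nr_slots[type];
    }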
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
518 | perf_overflow_handler_t triggered, | 500 | perf_overflow_handler_t triggered, |
519 | void *context) | 501 | void *context) |
520 | { | 502 | { |
521 | struct perf_event * __percpu *cpu_events, **pevent, *bp; | 503 | struct perf_event * __percpu *cpu_events, *bp; |
522 | long err; | 504 | long err = 0; |
523 | int cpu; | 505 | int cpu; |
524 | 506 | ||
525 | cpu_events = alloc_percpu(typeof(*cpu_events)); | 507 | cpu_events = alloc_percpu(typeof(*cpu_events)); |
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
528 | 510 | ||
529 | get_online_cpus(); | 511 | get_online_cpus(); |
530 | for_each_online_cpu(cpu) { | 512 | for_each_online_cpu(cpu) { |
531 | pevent = per_cpu_ptr(cpu_events, cpu); | ||
532 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, | 513 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, |
533 | triggered, context); | 514 | triggered, context); |
534 | |||
535 | *pevent = bp; | ||
536 | |||
537 | if (IS_ERR(bp)) { | 515 | if (IS_ERR(bp)) { |
538 | err = PTR_ERR(bp); | 516 | err = PTR_ERR(bp); |
539 | goto fail; | 517 | break; |
540 | } | 518 | } |
541 | } | ||
542 | put_online_cpus(); | ||
543 | 519 | ||
544 | return cpu_events; | 520 | per_cpu(*cpu_events, cpu) = bp; |
545 | |||
546 | fail: | ||
547 | for_each_online_cpu(cpu) { | ||
548 | pevent = per_cpu_ptr(cpu_events, cpu); | ||
549 | if (IS_ERR(*pevent)) | ||
550 | break; | ||
551 | unregister_hw_breakpoint(*pevent); | ||
552 | } | 521 | } |
553 | put_online_cpus(); | 522 | put_online_cpus(); |
554 | 523 | ||
555 | free_percpu(cpu_events); | 524 | if (likely(!err)) |
525 | return cpu_events; | ||
526 | |||
527 | unregister_wide_hw_breakpoint(cpu_events); | ||
556 | return (void __percpu __force *)ERR_PTR(err); | 528 | return (void __percpu __force *)ERR_PTR(err); |
557 | } | 529 | } |
558 | EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); | 530 | EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); |
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); | |||
564 | void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) | 536 | void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) |
565 | { | 537 | { |
566 | int cpu; | 538 | int cpu; |
567 | struct perf_event **pevent; | ||
568 | 539 | ||
569 | for_each_possible_cpu(cpu) { | 540 | for_each_possible_cpu(cpu) |
570 | pevent = per_cpu_ptr(cpu_events, cpu); | 541 | unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); |
571 | unregister_hw_breakpoint(*pevent); | 542 | |
572 | } | ||
573 | free_percpu(cpu_events); | 543 | free_percpu(cpu_events); |
574 | } | 544 | } |
575 | EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); | 545 | EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); |
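For context, the register/unregister pair being simplified here is typically used as in the following sketch, modeled loosely on the in-tree hw_breakpoint sample; the watched symbol and the handler body are placeholders:

    #include <linux/module.h>
    #include <linux/kallsyms.h>
    #include <linux/perf_event.h>
    #include <linux/hw_breakpoint.h>

    static struct perf_event * __percpu *wide_bp;

    /* called on whichever CPU's breakpoint fires */
    static void wide_bp_handler(struct perf_event *bp,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
    {
            pr_info("write to watched symbol detected\n");
    }

    static int __init wide_bp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies");   /* placeholder target */
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wide_bp = register_wide_hw_breakpoint(&attr, wide_bp_handler, NULL);
            if (IS_ERR((void __force *)wide_bp))
                    return PTR_ERR((void __force *)wide_bp);
            return 0;
    }

    static void __exit wide_bp_exit(void)
    {
            unregister_wide_hw_breakpoint(wide_bp);
    }

    module_init(wide_bp_init);
    module_exit(wide_bp_exit);
    MODULE_LICENSE("GPL");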
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags) | |||
612 | if (!(flags & PERF_EF_START)) | 582 | if (!(flags & PERF_EF_START)) |
613 | bp->hw.state = PERF_HES_STOPPED; | 583 | bp->hw.state = PERF_HES_STOPPED; |
614 | 584 | ||
585 | if (is_sampling_event(bp)) { | ||
586 | bp->hw.last_period = bp->hw.sample_period; | ||
587 | perf_swevent_set_period(bp); | ||
588 | } | ||
589 | |||
615 | return arch_install_hw_breakpoint(bp); | 590 | return arch_install_hw_breakpoint(bp); |
616 | } | 591 | } |
617 | 592 | ||
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = { | |||
650 | 625 | ||
651 | int __init init_hw_breakpoint(void) | 626 | int __init init_hw_breakpoint(void) |
652 | { | 627 | { |
653 | unsigned int **task_bp_pinned; | ||
654 | int cpu, err_cpu; | 628 | int cpu, err_cpu; |
655 | int i; | 629 | int i; |
656 | 630 | ||
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void) | |||
659 | 633 | ||
660 | for_each_possible_cpu(cpu) { | 634 | for_each_possible_cpu(cpu) { |
661 | for (i = 0; i < TYPE_MAX; i++) { | 635 | for (i = 0; i < TYPE_MAX; i++) { |
662 | task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); | 636 | struct bp_cpuinfo *info = get_bp_info(cpu, i); |
663 | *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], | 637 | |
664 | GFP_KERNEL); | 638 | info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), |
665 | if (!*task_bp_pinned) | 639 | GFP_KERNEL); |
640 | if (!info->tsk_pinned) | ||
666 | goto err_alloc; | 641 | goto err_alloc; |
667 | } | 642 | } |
668 | } | 643 | } |
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void) | |||
676 | err_alloc: | 651 | err_alloc: |
677 | for_each_possible_cpu(err_cpu) { | 652 | for_each_possible_cpu(err_cpu) { |
678 | for (i = 0; i < TYPE_MAX; i++) | 653 | for (i = 0; i < TYPE_MAX; i++) |
679 | kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); | 654 | kfree(get_bp_info(err_cpu, i)->tsk_pinned); |
680 | if (err_cpu == cpu) | 655 | if (err_cpu == cpu) |
681 | break; | 656 | break; |
682 | } | 657 | } |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -31,6 +31,10 @@ struct ring_buffer { | |||
31 | spinlock_t event_lock; | 31 | spinlock_t event_lock; |
32 | struct list_head event_list; | 32 | struct list_head event_list; |
33 | 33 | ||
34 | atomic_t mmap_count; | ||
35 | unsigned long mmap_locked; | ||
36 | struct user_struct *mmap_user; | ||
37 | |||
34 | struct perf_event_mmap_page *user_page; | 38 | struct perf_event_mmap_page *user_page; |
35 | void *data_pages[0]; | 39 | void *data_pages[0]; |
36 | }; | 40 | }; |
diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..7bb73f9d09db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
649 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 649 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
650 | */ | 650 | */ |
651 | forget_original_parent(tsk); | 651 | forget_original_parent(tsk); |
652 | exit_task_namespaces(tsk); | ||
653 | 652 | ||
654 | write_lock_irq(&tasklist_lock); | 653 | write_lock_irq(&tasklist_lock); |
655 | if (group_dead) | 654 | if (group_dead) |
@@ -795,6 +794,7 @@ void do_exit(long code) | |||
795 | exit_shm(tsk); | 794 | exit_shm(tsk); |
796 | exit_files(tsk); | 795 | exit_files(tsk); |
797 | exit_fs(tsk); | 796 | exit_fs(tsk); |
797 | exit_task_namespaces(tsk); | ||
798 | exit_task_work(tsk); | 798 | exit_task_work(tsk); |
799 | check_stack_usage(); | 799 | check_stack_usage(); |
800 | exit_thread(); | 800 | exit_thread(); |
diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c94..c3a1a55a5214 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -61,6 +61,8 @@ | |||
61 | #include <linux/nsproxy.h> | 61 | #include <linux/nsproxy.h> |
62 | #include <linux/ptrace.h> | 62 | #include <linux/ptrace.h> |
63 | #include <linux/sched/rt.h> | 63 | #include <linux/sched/rt.h> |
64 | #include <linux/hugetlb.h> | ||
65 | #include <linux/freezer.h> | ||
64 | 66 | ||
65 | #include <asm/futex.h> | 67 | #include <asm/futex.h> |
66 | 68 | ||
@@ -365,7 +367,7 @@ again: | |||
365 | } else { | 367 | } else { |
366 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 368 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
367 | key->shared.inode = page_head->mapping->host; | 369 | key->shared.inode = page_head->mapping->host; |
368 | key->shared.pgoff = page_head->index; | 370 | key->shared.pgoff = basepage_index(page); |
369 | } | 371 | } |
370 | 372 | ||
371 | get_futex_key_refs(key); | 373 | get_futex_key_refs(key); |
@@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1807 | * is no timeout, or if it has yet to expire. | 1809 | * is no timeout, or if it has yet to expire. |
1808 | */ | 1810 | */ |
1809 | if (!timeout || timeout->task) | 1811 | if (!timeout || timeout->task) |
1810 | schedule(); | 1812 | freezable_schedule(); |
1811 | } | 1813 | } |
1812 | __set_current_state(TASK_RUNNING); | 1814 | __set_current_state(TASK_RUNNING); |
1813 | } | 1815 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b000..a3bb14fbe5c6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc) | |||
213 | irq_state_clr_masked(desc); | 213 | irq_state_clr_masked(desc); |
214 | } | 214 | } |
215 | 215 | ||
216 | /** | ||
217 | * irq_disable - Mark interrupt disabled | ||
218 | * @desc: irq descriptor which should be disabled | ||
219 | * | ||
220 | * If the chip does not implement the irq_disable callback, we | ||
221 | * use a lazy disable approach. That means we mark the interrupt | ||
222 | * disabled, but leave the hardware unmasked. That's an | ||
223 | * optimization because we avoid the hardware access for the | ||
224 | * common case where no interrupt happens after we marked it | ||
225 | * disabled. If an interrupt happens, then the interrupt flow | ||
226 | * handler masks the line at the hardware level and marks it | ||
227 | * pending. | ||
228 | */ | ||
216 | void irq_disable(struct irq_desc *desc) | 229 | void irq_disable(struct irq_desc *desc) |
217 | { | 230 | { |
218 | irq_state_set_disabled(desc); | 231 | irq_state_set_disabled(desc); |
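Since the hunk stops after the first statement, here is a sketch of the lazy-disable behaviour the new kernel-doc describes, using the field names of this era's irq core; it is a reading aid, not part of the patch:

    void irq_disable(struct irq_desc *desc)
    {
            irq_state_set_disabled(desc);
            if (desc->irq_data.chip->irq_disable) {
                    /* chip wants an eager disable: mask at the hardware now */
                    desc->irq_data.chip->irq_disable(&desc->irq_data);
                    irq_state_set_masked(desc);
            }
            /*
             * Otherwise stay lazy: leave the line unmasked. If it fires,
             * the flow handler masks it and marks it pending.
             */
    }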
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f668..1c39eccc1eaf 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/irq.h> | 7 | #include <linux/irq.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/export.h> | 9 | #include <linux/export.h> |
10 | #include <linux/irqdomain.h> | ||
10 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
11 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
12 | #include <linux/syscore_ops.h> | 13 | #include <linux/syscore_ops.h> |
@@ -16,11 +17,6 @@ | |||
16 | static LIST_HEAD(gc_list); | 17 | static LIST_HEAD(gc_list); |
17 | static DEFINE_RAW_SPINLOCK(gc_lock); | 18 | static DEFINE_RAW_SPINLOCK(gc_lock); |
18 | 19 | ||
19 | static inline struct irq_chip_regs *cur_regs(struct irq_data *d) | ||
20 | { | ||
21 | return &container_of(d->chip, struct irq_chip_type, chip)->regs; | ||
22 | } | ||
23 | |||
24 | /** | 20 | /** |
25 | * irq_gc_noop - NOOP function | 21 | * irq_gc_noop - NOOP function |
26 | * @d: irq_data | 22 | * @d: irq_data |
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d) | |||
39 | void irq_gc_mask_disable_reg(struct irq_data *d) | 35 | void irq_gc_mask_disable_reg(struct irq_data *d) |
40 | { | 36 | { |
41 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 37 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
42 | u32 mask = 1 << (d->irq - gc->irq_base); | 38 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
39 | u32 mask = d->mask; | ||
43 | 40 | ||
44 | irq_gc_lock(gc); | 41 | irq_gc_lock(gc); |
45 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); | 42 | irq_reg_writel(mask, gc->reg_base + ct->regs.disable); |
46 | gc->mask_cache &= ~mask; | 43 | *ct->mask_cache &= ~mask; |
47 | irq_gc_unlock(gc); | 44 | irq_gc_unlock(gc); |
48 | } | 45 | } |
49 | 46 | ||
50 | /** | 47 | /** |
51 | * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register | 48 | * irq_gc_mask_set_bit - Mask chip via setting bit in mask register |
52 | * @d: irq_data | 49 | * @d: irq_data |
53 | * | 50 | * |
54 | * Chip has a single mask register. Values of this register are cached | 51 | * Chip has a single mask register. Values of this register are cached |
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d) | |||
57 | void irq_gc_mask_set_bit(struct irq_data *d) | 54 | void irq_gc_mask_set_bit(struct irq_data *d) |
58 | { | 55 | { |
59 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 56 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
60 | u32 mask = 1 << (d->irq - gc->irq_base); | 57 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
58 | u32 mask = d->mask; | ||
61 | 59 | ||
62 | irq_gc_lock(gc); | 60 | irq_gc_lock(gc); |
63 | gc->mask_cache |= mask; | 61 | *ct->mask_cache |= mask; |
64 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | 62 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); |
65 | irq_gc_unlock(gc); | 63 | irq_gc_unlock(gc); |
66 | } | 64 | } |
65 | EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); | ||
67 | 66 | ||
68 | /** | 67 | /** |
69 | * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register | 68 | * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register |
70 | * @d: irq_data | 69 | * @d: irq_data |
71 | * | 70 | * |
72 | * Chip has a single mask register. Values of this register are cached | 71 | * Chip has a single mask register. Values of this register are cached |
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d) | |||
75 | void irq_gc_mask_clr_bit(struct irq_data *d) | 74 | void irq_gc_mask_clr_bit(struct irq_data *d) |
76 | { | 75 | { |
77 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 76 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
78 | u32 mask = 1 << (d->irq - gc->irq_base); | 77 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
78 | u32 mask = d->mask; | ||
79 | 79 | ||
80 | irq_gc_lock(gc); | 80 | irq_gc_lock(gc); |
81 | gc->mask_cache &= ~mask; | 81 | *ct->mask_cache &= ~mask; |
82 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | 82 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); |
83 | irq_gc_unlock(gc); | 83 | irq_gc_unlock(gc); |
84 | } | 84 | } |
85 | EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); | ||
85 | 86 | ||
86 | /** | 87 | /** |
87 | * irq_gc_unmask_enable_reg - Unmask chip via enable register | 88 | * irq_gc_unmask_enable_reg - Unmask chip via enable register |
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d) | |||
93 | void irq_gc_unmask_enable_reg(struct irq_data *d) | 94 | void irq_gc_unmask_enable_reg(struct irq_data *d) |
94 | { | 95 | { |
95 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 96 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
96 | u32 mask = 1 << (d->irq - gc->irq_base); | 97 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
98 | u32 mask = d->mask; | ||
97 | 99 | ||
98 | irq_gc_lock(gc); | 100 | irq_gc_lock(gc); |
99 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); | 101 | irq_reg_writel(mask, gc->reg_base + ct->regs.enable); |
100 | gc->mask_cache |= mask; | 102 | *ct->mask_cache |= mask; |
101 | irq_gc_unlock(gc); | 103 | irq_gc_unlock(gc); |
102 | } | 104 | } |
103 | 105 | ||
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) | |||
108 | void irq_gc_ack_set_bit(struct irq_data *d) | 110 | void irq_gc_ack_set_bit(struct irq_data *d) |
109 | { | 111 | { |
110 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 112 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
111 | u32 mask = 1 << (d->irq - gc->irq_base); | 113 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
114 | u32 mask = d->mask; | ||
112 | 115 | ||
113 | irq_gc_lock(gc); | 116 | irq_gc_lock(gc); |
114 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | 117 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); |
115 | irq_gc_unlock(gc); | 118 | irq_gc_unlock(gc); |
116 | } | 119 | } |
120 | EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); | ||
117 | 121 | ||
118 | /** | 122 | /** |
119 | * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit | 123 | * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit |
@@ -122,10 +126,11 @@ void irq_gc_ack_set_bit(struct irq_data *d) | |||
122 | void irq_gc_ack_clr_bit(struct irq_data *d) | 126 | void irq_gc_ack_clr_bit(struct irq_data *d) |
123 | { | 127 | { |
124 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 128 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
125 | u32 mask = ~(1 << (d->irq - gc->irq_base)); | 129 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
130 | u32 mask = ~d->mask; | ||
126 | 131 | ||
127 | irq_gc_lock(gc); | 132 | irq_gc_lock(gc); |
128 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | 133 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); |
129 | irq_gc_unlock(gc); | 134 | irq_gc_unlock(gc); |
130 | } | 135 | } |
131 | 136 | ||
@@ -136,11 +141,12 @@ void irq_gc_ack_clr_bit(struct irq_data *d) | |||
136 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | 141 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) |
137 | { | 142 | { |
138 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 143 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
139 | u32 mask = 1 << (d->irq - gc->irq_base); | 144 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
145 | u32 mask = d->mask; | ||
140 | 146 | ||
141 | irq_gc_lock(gc); | 147 | irq_gc_lock(gc); |
142 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); | 148 | irq_reg_writel(mask, gc->reg_base + ct->regs.mask); |
143 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | 149 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); |
144 | irq_gc_unlock(gc); | 150 | irq_gc_unlock(gc); |
145 | } | 151 | } |
146 | 152 | ||
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | |||
151 | void irq_gc_eoi(struct irq_data *d) | 157 | void irq_gc_eoi(struct irq_data *d) |
152 | { | 158 | { |
153 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 159 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
154 | u32 mask = 1 << (d->irq - gc->irq_base); | 160 | struct irq_chip_type *ct = irq_data_get_chip_type(d); |
161 | u32 mask = d->mask; | ||
155 | 162 | ||
156 | irq_gc_lock(gc); | 163 | irq_gc_lock(gc); |
157 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); | 164 | irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); |
158 | irq_gc_unlock(gc); | 165 | irq_gc_unlock(gc); |
159 | } | 166 | } |
160 | 167 | ||
161 | /** | 168 | /** |
162 | * irq_gc_set_wake - Set/clr wake bit for an interrupt | 169 | * irq_gc_set_wake - Set/clr wake bit for an interrupt |
163 | * @d: irq_data | 170 | * @d: irq_data |
171 | * @on: Indicates whether the wake bit should be set or cleared | ||
164 | * | 172 | * |
165 | * For chips where the wake from suspend functionality is not | 173 | * For chips where the wake from suspend functionality is not |
166 | * configured in a separate register and the wakeup active state is | 174 | * configured in a separate register and the wakeup active state is |
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d) | |||
169 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) | 177 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) |
170 | { | 178 | { |
171 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | 179 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); |
172 | u32 mask = 1 << (d->irq - gc->irq_base); | 180 | u32 mask = d->mask; |
173 | 181 | ||
174 | if (!(mask & gc->wake_enabled)) | 182 | if (!(mask & gc->wake_enabled)) |
175 | return -EINVAL; | 183 | return -EINVAL; |
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) | |||
183 | return 0; | 191 | return 0; |
184 | } | 192 | } |
185 | 193 | ||
194 | static void | ||
195 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | ||
196 | int num_ct, unsigned int irq_base, | ||
197 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
198 | { | ||
199 | raw_spin_lock_init(&gc->lock); | ||
200 | gc->num_ct = num_ct; | ||
201 | gc->irq_base = irq_base; | ||
202 | gc->reg_base = reg_base; | ||
203 | gc->chip_types->chip.name = name; | ||
204 | gc->chip_types->handler = handler; | ||
205 | } | ||
206 | |||
186 | /** | 207 | /** |
187 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it | 208 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it |
188 | * @name: Name of the irq chip | 209 | * @name: Name of the irq chip |
@@ -203,23 +224,185 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | |||
203 | 224 | ||
204 | gc = kzalloc(sz, GFP_KERNEL); | 225 | gc = kzalloc(sz, GFP_KERNEL); |
205 | if (gc) { | 226 | if (gc) { |
206 | raw_spin_lock_init(&gc->lock); | 227 | irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, |
207 | gc->num_ct = num_ct; | 228 | handler); |
208 | gc->irq_base = irq_base; | ||
209 | gc->reg_base = reg_base; | ||
210 | gc->chip_types->chip.name = name; | ||
211 | gc->chip_types->handler = handler; | ||
212 | } | 229 | } |
213 | return gc; | 230 | return gc; |
214 | } | 231 | } |
215 | EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); | 232 | EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); |
216 | 233 | ||
234 | static void | ||
235 | irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | ||
236 | { | ||
237 | struct irq_chip_type *ct = gc->chip_types; | ||
238 | u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; | ||
239 | int i; | ||
240 | |||
241 | for (i = 0; i < gc->num_ct; i++) { | ||
242 | if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { | ||
243 | mskptr = &ct[i].mask_cache_priv; | ||
244 | mskreg = ct[i].regs.mask; | ||
245 | } | ||
246 | ct[i].mask_cache = mskptr; | ||
247 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
248 | *mskptr = irq_reg_readl(gc->reg_base + mskreg); | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /** | ||
253 | * irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain | ||
254 | * @d: irq domain for which to allocate chips | ||
255 | * @irqs_per_chip: Number of interrupts each chip handles | ||
256 | * @num_ct: Number of irq_chip_type instances associated with this | ||
257 | * @name: Name of the irq chip | ||
258 | * @handler: Default flow handler associated with these chips | ||
259 | * @clr: IRQ_* bits to clear in the mapping function | ||
260 | * @set: IRQ_* bits to set in the mapping function | ||
261 | * @gcflags: Generic chip specific setup flags | ||
262 | */ | ||
263 | int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | ||
264 | int num_ct, const char *name, | ||
265 | irq_flow_handler_t handler, | ||
266 | unsigned int clr, unsigned int set, | ||
267 | enum irq_gc_flags gcflags) | ||
268 | { | ||
269 | struct irq_domain_chip_generic *dgc; | ||
270 | struct irq_chip_generic *gc; | ||
271 | int numchips, sz, i; | ||
272 | unsigned long flags; | ||
273 | void *tmp; | ||
274 | |||
275 | if (d->gc) | ||
276 | return -EBUSY; | ||
277 | |||
278 | if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) | ||
279 | return -EINVAL; | ||
280 | |||
281 | numchips = d->revmap_data.linear.size / irqs_per_chip; | ||
282 | if (!numchips) | ||
283 | return -EINVAL; | ||
284 | |||
285 | /* Allocate a pointer, generic chip and chiptypes for each chip */ | ||
286 | sz = sizeof(*dgc) + numchips * sizeof(gc); | ||
287 | sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); | ||
288 | |||
289 | tmp = dgc = kzalloc(sz, GFP_KERNEL); | ||
290 | if (!dgc) | ||
291 | return -ENOMEM; | ||
292 | dgc->irqs_per_chip = irqs_per_chip; | ||
293 | dgc->num_chips = numchips; | ||
294 | dgc->irq_flags_to_set = set; | ||
295 | dgc->irq_flags_to_clear = clr; | ||
296 | dgc->gc_flags = gcflags; | ||
297 | d->gc = dgc; | ||
298 | |||
299 | /* Calc pointer to the first generic chip */ | ||
300 | tmp += sizeof(*dgc) + numchips * sizeof(gc); | ||
301 | for (i = 0; i < numchips; i++) { | ||
302 | /* Store the pointer to the generic chip */ | ||
303 | dgc->gc[i] = gc = tmp; | ||
304 | irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, | ||
305 | NULL, handler); | ||
306 | gc->domain = d; | ||
307 | raw_spin_lock_irqsave(&gc_lock, flags); | ||
308 | list_add_tail(&gc->list, &gc_list); | ||
309 | raw_spin_unlock_irqrestore(&gc_lock, flags); | ||
310 | /* Calc pointer to the next generic chip */ | ||
311 | tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
312 | } | ||
313 | return 0; | ||
314 | } | ||
315 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | ||
316 | |||
317 | /** | ||
318 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq | ||
319 | * @d: irq domain pointer | ||
320 | * @hw_irq: Hardware interrupt number | ||
321 | */ | ||
322 | struct irq_chip_generic * | ||
323 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | ||
324 | { | ||
325 | struct irq_domain_chip_generic *dgc = d->gc; | ||
326 | int idx; | ||
327 | |||
328 | if (!dgc) | ||
329 | return NULL; | ||
330 | idx = hw_irq / dgc->irqs_per_chip; | ||
331 | if (idx >= dgc->num_chips) | ||
332 | return NULL; | ||
333 | return dgc->gc[idx]; | ||
334 | } | ||
335 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); | ||
336 | |||
217 | /* | 337 | /* |
218 | * Separate lockdep class for interrupt chip which can nest irq_desc | 338 | * Separate lockdep class for interrupt chip which can nest irq_desc |
219 | * lock. | 339 | * lock. |
220 | */ | 340 | */ |
221 | static struct lock_class_key irq_nested_lock_class; | 341 | static struct lock_class_key irq_nested_lock_class; |
222 | 342 | ||
343 | /* | ||
344 | * irq_map_generic_chip - Map a generic chip for an irq domain | ||
345 | */ | ||
346 | static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | ||
347 | irq_hw_number_t hw_irq) | ||
348 | { | ||
349 | struct irq_data *data = irq_get_irq_data(virq); | ||
350 | struct irq_domain_chip_generic *dgc = d->gc; | ||
351 | struct irq_chip_generic *gc; | ||
352 | struct irq_chip_type *ct; | ||
353 | struct irq_chip *chip; | ||
354 | unsigned long flags; | ||
355 | int idx; | ||
356 | |||
357 | if (!d->gc) | ||
358 | return -ENODEV; | ||
359 | |||
360 | idx = hw_irq / dgc->irqs_per_chip; | ||
361 | if (idx >= dgc->num_chips) | ||
362 | return -EINVAL; | ||
363 | gc = dgc->gc[idx]; | ||
364 | |||
365 | idx = hw_irq % dgc->irqs_per_chip; | ||
366 | |||
367 | if (test_bit(idx, &gc->unused)) | ||
368 | return -ENOTSUPP; | ||
369 | |||
370 | if (test_bit(idx, &gc->installed)) | ||
371 | return -EBUSY; | ||
372 | |||
373 | ct = gc->chip_types; | ||
374 | chip = &ct->chip; | ||
375 | |||
376 | /* We only init the cache for the first mapping of a generic chip */ | ||
377 | if (!gc->installed) { | ||
378 | raw_spin_lock_irqsave(&gc->lock, flags); | ||
379 | irq_gc_init_mask_cache(gc, dgc->gc_flags); | ||
380 | raw_spin_unlock_irqrestore(&gc->lock, flags); | ||
381 | } | ||
382 | |||
383 | /* Mark the interrupt as installed */ | ||
384 | set_bit(idx, &gc->installed); | ||
385 | |||
386 | if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) | ||
387 | irq_set_lockdep_class(virq, &irq_nested_lock_class); | ||
388 | |||
389 | if (chip->irq_calc_mask) | ||
390 | chip->irq_calc_mask(data); | ||
391 | else | ||
392 | data->mask = 1 << idx; | ||
393 | |||
394 | irq_set_chip_and_handler(virq, chip, ct->handler); | ||
395 | irq_set_chip_data(virq, gc); | ||
396 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | ||
397 | return 0; | ||
398 | } | ||
399 | |||
400 | struct irq_domain_ops irq_generic_chip_ops = { | ||
401 | .map = irq_map_generic_chip, | ||
402 | .xlate = irq_domain_xlate_onetwocell, | ||
403 | }; | ||
404 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); | ||
405 | |||
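To see how the new per-domain allocator, the lookup helper and irq_generic_chip_ops fit together, here is a hedged sketch of a driver-side consumer; the 32-interrupt geometry, the MYINTC_* register offsets and the chosen callbacks are illustrative only:

    #define MYINTC_MASK  0x04            /* made-up register offsets */
    #define MYINTC_ACK   0x08

    static int __init myintc_init(struct device_node *np, void __iomem *base)
    {
            struct irq_chip_generic *gc;
            struct irq_domain *domain;
            int ret;

            domain = irq_domain_add_linear(np, 32, &irq_generic_chip_ops, NULL);
            if (!domain)
                    return -ENOMEM;

            ret = irq_alloc_domain_generic_chips(domain, 32, 1, "MYINTC",
                                                 handle_level_irq, 0, 0,
                                                 IRQ_GC_INIT_MASK_CACHE);
            if (ret)
                    return ret;

            /* one chip covers the whole domain, so ask for the chip of hwirq 0 */
            gc = irq_get_domain_generic_chip(domain, 0);
            gc->reg_base                      = base;
            gc->chip_types[0].regs.mask       = MYINTC_MASK;
            gc->chip_types[0].regs.ack        = MYINTC_ACK;
            gc->chip_types[0].chip.irq_mask   = irq_gc_mask_set_bit;
            gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
            gc->chip_types[0].chip.irq_ack    = irq_gc_ack_set_bit;

            return 0;
    }

Each later irq_create_mapping() on such a domain then goes through irq_map_generic_chip() above, which initializes the mask cache on first use, computes d->mask and installs the flow handler.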
223 | /** | 406 | /** |
224 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip | 407 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip |
225 | * @gc: Generic irq chip holding all data | 408 | * @gc: Generic irq chip holding all data |
@@ -237,15 +420,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
237 | unsigned int set) | 420 | unsigned int set) |
238 | { | 421 | { |
239 | struct irq_chip_type *ct = gc->chip_types; | 422 | struct irq_chip_type *ct = gc->chip_types; |
423 | struct irq_chip *chip = &ct->chip; | ||
240 | unsigned int i; | 424 | unsigned int i; |
241 | 425 | ||
242 | raw_spin_lock(&gc_lock); | 426 | raw_spin_lock(&gc_lock); |
243 | list_add_tail(&gc->list, &gc_list); | 427 | list_add_tail(&gc->list, &gc_list); |
244 | raw_spin_unlock(&gc_lock); | 428 | raw_spin_unlock(&gc_lock); |
245 | 429 | ||
246 | /* Init mask cache ? */ | 430 | irq_gc_init_mask_cache(gc, flags); |
247 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
248 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | ||
249 | 431 | ||
250 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | 432 | for (i = gc->irq_base; msk; msk >>= 1, i++) { |
251 | if (!(msk & 0x01)) | 433 | if (!(msk & 0x01)) |
@@ -254,7 +436,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
254 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | 436 | if (flags & IRQ_GC_INIT_NESTED_LOCK) |
255 | irq_set_lockdep_class(i, &irq_nested_lock_class); | 437 | irq_set_lockdep_class(i, &irq_nested_lock_class); |
256 | 438 | ||
257 | irq_set_chip_and_handler(i, &ct->chip, ct->handler); | 439 | if (!(flags & IRQ_GC_NO_MASK)) { |
440 | struct irq_data *d = irq_get_irq_data(i); | ||
441 | |||
442 | if (chip->irq_calc_mask) | ||
443 | chip->irq_calc_mask(d); | ||
444 | else | ||
445 | d->mask = 1 << (i - gc->irq_base); | ||
446 | } | ||
447 | irq_set_chip_and_handler(i, chip, ct->handler); | ||
258 | irq_set_chip_data(i, gc); | 448 | irq_set_chip_data(i, gc); |
259 | irq_modify_status(i, clr, set); | 449 | irq_modify_status(i, clr, set); |
260 | } | 450 | } |
@@ -265,7 +455,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip); | |||
265 | /** | 455 | /** |
266 | * irq_setup_alt_chip - Switch to alternative chip | 456 | * irq_setup_alt_chip - Switch to alternative chip |
267 | * @d: irq_data for this interrupt | 457 | * @d: irq_data for this interrupt |
268 | * @type Flow type to be initialized | 458 | * @type: Flow type to be initialized |
269 | * | 459 | * |
270 | * Only to be called from chip->irq_set_type() callbacks. | 460 | * Only to be called from chip->irq_set_type() callbacks. |
271 | */ | 461 | */ |
@@ -317,6 +507,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
317 | } | 507 | } |
318 | EXPORT_SYMBOL_GPL(irq_remove_generic_chip); | 508 | EXPORT_SYMBOL_GPL(irq_remove_generic_chip); |
319 | 509 | ||
510 | static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) | ||
511 | { | ||
512 | unsigned int virq; | ||
513 | |||
514 | if (!gc->domain) | ||
515 | return irq_get_irq_data(gc->irq_base); | ||
516 | |||
517 | /* | ||
518 | * We don't know which of the irqs has been actually | ||
519 | * installed. Use the first one. | ||
520 | */ | ||
521 | if (!gc->installed) | ||
522 | return NULL; | ||
523 | |||
524 | virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); | ||
525 | return virq ? irq_get_irq_data(virq) : NULL; | ||
526 | } | ||
527 | |||
320 | #ifdef CONFIG_PM | 528 | #ifdef CONFIG_PM |
321 | static int irq_gc_suspend(void) | 529 | static int irq_gc_suspend(void) |
322 | { | 530 | { |
@@ -325,8 +533,12 @@ static int irq_gc_suspend(void) | |||
325 | list_for_each_entry(gc, &gc_list, list) { | 533 | list_for_each_entry(gc, &gc_list, list) { |
326 | struct irq_chip_type *ct = gc->chip_types; | 534 | struct irq_chip_type *ct = gc->chip_types; |
327 | 535 | ||
328 | if (ct->chip.irq_suspend) | 536 | if (ct->chip.irq_suspend) { |
329 | ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); | 537 | struct irq_data *data = irq_gc_get_irq_data(gc); |
538 | |||
539 | if (data) | ||
540 | ct->chip.irq_suspend(data); | ||
541 | } | ||
330 | } | 542 | } |
331 | return 0; | 543 | return 0; |
332 | } | 544 | } |
@@ -338,8 +550,12 @@ static void irq_gc_resume(void) | |||
338 | list_for_each_entry(gc, &gc_list, list) { | 550 | list_for_each_entry(gc, &gc_list, list) { |
339 | struct irq_chip_type *ct = gc->chip_types; | 551 | struct irq_chip_type *ct = gc->chip_types; |
340 | 552 | ||
341 | if (ct->chip.irq_resume) | 553 | if (ct->chip.irq_resume) { |
342 | ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); | 554 | struct irq_data *data = irq_gc_get_irq_data(gc); |
555 | |||
556 | if (data) | ||
557 | ct->chip.irq_resume(data); | ||
558 | } | ||
343 | } | 559 | } |
344 | } | 560 | } |
345 | #else | 561 | #else |
@@ -354,8 +570,12 @@ static void irq_gc_shutdown(void) | |||
354 | list_for_each_entry(gc, &gc_list, list) { | 570 | list_for_each_entry(gc, &gc_list, list) { |
355 | struct irq_chip_type *ct = gc->chip_types; | 571 | struct irq_chip_type *ct = gc->chip_types; |
356 | 572 | ||
357 | if (ct->chip.irq_pm_shutdown) | 573 | if (ct->chip.irq_pm_shutdown) { |
358 | ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); | 574 | struct irq_data *data = irq_gc_get_irq_data(gc); |
575 | |||
576 | if (data) | ||
577 | ct->chip.irq_pm_shutdown(data); | ||
578 | } | ||
359 | } | 579 | } |
360 | } | 580 | } |
361 | 581 | ||
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0c..1ed8dff17eb9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -16,12 +16,6 @@ | |||
16 | #include <linux/smp.h> | 16 | #include <linux/smp.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | 18 | ||
19 | #define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. | ||
20 | * ie. legacy 8259, gets irqs 1..15 */ | ||
21 | #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ | ||
22 | #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ | ||
23 | #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ | ||
24 | |||
25 | static LIST_HEAD(irq_domain_list); | 19 | static LIST_HEAD(irq_domain_list); |
26 | static DEFINE_MUTEX(irq_domain_mutex); | 20 | static DEFINE_MUTEX(irq_domain_mutex); |
27 | 21 | ||
@@ -143,7 +137,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | |||
143 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. | 137 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. |
144 | * @of_node: pointer to interrupt controller's device tree node. | 138 | * @of_node: pointer to interrupt controller's device tree node. |
145 | * @size: total number of irqs in mapping | 139 | * @size: total number of irqs in mapping |
146 | * @first_irq: first number of irq block assigned to the domain | 140 | * @first_irq: first number of irq block assigned to the domain, |
141 | * pass zero to assign irqs on-the-fly. This will result in a | ||
142 | * linear IRQ domain so it is important to use irq_create_mapping() | ||
143 | * for each used IRQ, especially when SPARSE_IRQ is enabled. | ||
147 | * @ops: map/unmap domain callbacks | 144 | * @ops: map/unmap domain callbacks |
148 | * @host_data: Controller private data pointer | 145 | * @host_data: Controller private data pointer |
149 | * | 146 | * |
@@ -191,6 +188,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
191 | /* A linear domain is the default */ | 188 | /* A linear domain is the default */ |
192 | return irq_domain_add_linear(of_node, size, ops, host_data); | 189 | return irq_domain_add_linear(of_node, size, ops, host_data); |
193 | } | 190 | } |
191 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | ||
194 | 192 | ||
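A short sketch of what the amended kernel-doc asks callers to do when they pass first_irq == 0; np, my_domain_ops and priv stand in for the caller's own device node, ops and private data:

    struct irq_domain *domain;
    unsigned int hwirq;

    domain = irq_domain_add_simple(np, 16, 0, &my_domain_ops, priv);
    if (!domain)
            return -ENOMEM;

    /*
     * With first_irq == 0 this is a plain linear domain, so every hwirq
     * that will actually be used has to be mapped explicitly, especially
     * with SPARSE_IRQ, where no descriptor exists until a mapping is made.
     */
    for (hwirq = 0; hwirq < 16; hwirq++)
            irq_create_mapping(domain, hwirq);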
195 | /** | 193 | /** |
196 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. | 194 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. |
@@ -397,11 +395,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, | |||
397 | while (count--) { | 395 | while (count--) { |
398 | int irq = irq_base + count; | 396 | int irq = irq_base + count; |
399 | struct irq_data *irq_data = irq_get_irq_data(irq); | 397 | struct irq_data *irq_data = irq_get_irq_data(irq); |
400 | irq_hw_number_t hwirq = irq_data->hwirq; | 398 | irq_hw_number_t hwirq; |
401 | 399 | ||
402 | if (WARN_ON(!irq_data || irq_data->domain != domain)) | 400 | if (WARN_ON(!irq_data || irq_data->domain != domain)) |
403 | continue; | 401 | continue; |
404 | 402 | ||
403 | hwirq = irq_data->hwirq; | ||
405 | irq_set_status_flags(irq, IRQ_NOREQUEST); | 404 | irq_set_status_flags(irq, IRQ_NOREQUEST); |
406 | 405 | ||
407 | /* remove chip and handler */ | 406 | /* remove chip and handler */ |
@@ -693,7 +692,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
693 | 692 | ||
694 | /* Set type if specified and different than the current one */ | 693 | /* Set type if specified and different than the current one */ |
695 | if (type != IRQ_TYPE_NONE && | 694 | if (type != IRQ_TYPE_NONE && |
696 | type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) | 695 | type != irq_get_trigger_type(virq)) |
697 | irq_set_irq_type(virq, type); | 696 | irq_set_irq_type(virq, type); |
698 | return virq; | 697 | return virq; |
699 | } | 698 | } |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65a..514bcfd855a8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
555 | return 0; | 555 | return 0; |
556 | 556 | ||
557 | if (irq_settings_can_request(desc)) { | 557 | if (irq_settings_can_request(desc)) { |
558 | if (desc->action) | 558 | if (!desc->action || |
559 | if (irqflags & desc->action->flags & IRQF_SHARED) | 559 | irqflags & desc->action->flags & IRQF_SHARED) |
560 | canrequest =1; | 560 | canrequest = 1; |
561 | } | 561 | } |
562 | irq_put_desc_unlock(desc, flags); | 562 | irq_put_desc_unlock(desc, flags); |
563 | return canrequest; | 563 | return canrequest; |
@@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused) | |||
840 | static int irq_thread(void *data) | 840 | static int irq_thread(void *data) |
841 | { | 841 | { |
842 | struct callback_head on_exit_work; | 842 | struct callback_head on_exit_work; |
843 | static const struct sched_param param = { | ||
844 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
845 | }; | ||
846 | struct irqaction *action = data; | 843 | struct irqaction *action = data; |
847 | struct irq_desc *desc = irq_to_desc(action->irq); | 844 | struct irq_desc *desc = irq_to_desc(action->irq); |
848 | irqreturn_t (*handler_fn)(struct irq_desc *desc, | 845 | irqreturn_t (*handler_fn)(struct irq_desc *desc, |
@@ -854,8 +851,6 @@ static int irq_thread(void *data) | |||
854 | else | 851 | else |
855 | handler_fn = irq_thread_fn; | 852 | handler_fn = irq_thread_fn; |
856 | 853 | ||
857 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
858 | |||
859 | init_task_work(&on_exit_work, irq_thread_dtor); | 854 | init_task_work(&on_exit_work, irq_thread_dtor); |
860 | task_work_add(current, &on_exit_work, false); | 855 | task_work_add(current, &on_exit_work, false); |
861 | 856 | ||
@@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
950 | */ | 945 | */ |
951 | if (new->thread_fn && !nested) { | 946 | if (new->thread_fn && !nested) { |
952 | struct task_struct *t; | 947 | struct task_struct *t; |
948 | static const struct sched_param param = { | ||
949 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
950 | }; | ||
953 | 951 | ||
954 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, | 952 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, |
955 | new->name); | 953 | new->name); |
@@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
957 | ret = PTR_ERR(t); | 955 | ret = PTR_ERR(t); |
958 | goto out_mput; | 956 | goto out_mput; |
959 | } | 957 | } |
958 | |||
959 | sched_setscheduler(t, SCHED_FIFO, ¶m); | ||
960 | |||
960 | /* | 961 | /* |
961 | * We keep the reference to the task struct even if | 962 | * We keep the reference to the task struct even if |
962 | * the thread dies to avoid that the interrupt code | 963 | * the thread dies to avoid that the interrupt code |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..bddf3b201a48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
467 | /* Optimization staging list, protected by kprobe_mutex */ | 467 | /* Optimization staging list, protected by kprobe_mutex */ |
468 | static LIST_HEAD(optimizing_list); | 468 | static LIST_HEAD(optimizing_list); |
469 | static LIST_HEAD(unoptimizing_list); | 469 | static LIST_HEAD(unoptimizing_list); |
470 | static LIST_HEAD(freeing_list); | ||
470 | 471 | ||
471 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
472 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void) | |||
504 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | 505 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint |
505 | * if need) kprobes listed on unoptimizing_list. | 506 | * if need) kprobes listed on unoptimizing_list. |
506 | */ | 507 | */ |
507 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | 508 | static __kprobes void do_unoptimize_kprobes(void) |
508 | { | 509 | { |
509 | struct optimized_kprobe *op, *tmp; | 510 | struct optimized_kprobe *op, *tmp; |
510 | 511 | ||
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | |||
515 | /* Ditto to do_optimize_kprobes */ | 516 | /* Ditto to do_optimize_kprobes */ |
516 | get_online_cpus(); | 517 | get_online_cpus(); |
517 | mutex_lock(&text_mutex); | 518 | mutex_lock(&text_mutex); |
518 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | 519 | arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); |
519 | /* Loop free_list for disarming */ | 520 | /* Loop free_list for disarming */ |
520 | list_for_each_entry_safe(op, tmp, free_list, list) { | 521 | list_for_each_entry_safe(op, tmp, &freeing_list, list) { |
521 | /* Disarm probes if marked disabled */ | 522 | /* Disarm probes if marked disabled */ |
522 | if (kprobe_disabled(&op->kp)) | 523 | if (kprobe_disabled(&op->kp)) |
523 | arch_disarm_kprobe(&op->kp); | 524 | arch_disarm_kprobe(&op->kp); |
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | |||
536 | } | 537 | } |
537 | 538 | ||
538 | /* Reclaim all kprobes on the free_list */ | 539 | /* Reclaim all kprobes on the free_list */ |
539 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | 540 | static __kprobes void do_free_cleaned_kprobes(void) |
540 | { | 541 | { |
541 | struct optimized_kprobe *op, *tmp; | 542 | struct optimized_kprobe *op, *tmp; |
542 | 543 | ||
543 | list_for_each_entry_safe(op, tmp, free_list, list) { | 544 | list_for_each_entry_safe(op, tmp, &freeing_list, list) { |
544 | BUG_ON(!kprobe_unused(&op->kp)); | 545 | BUG_ON(!kprobe_unused(&op->kp)); |
545 | list_del_init(&op->list); | 546 | list_del_init(&op->list); |
546 | free_aggr_kprobe(&op->kp); | 547 | free_aggr_kprobe(&op->kp); |
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void) | |||
556 | /* Kprobe jump optimizer */ | 557 | /* Kprobe jump optimizer */ |
557 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 558 | static __kprobes void kprobe_optimizer(struct work_struct *work) |
558 | { | 559 | { |
559 | LIST_HEAD(free_list); | ||
560 | |||
561 | mutex_lock(&kprobe_mutex); | 560 | mutex_lock(&kprobe_mutex); |
562 | /* Lock modules while optimizing kprobes */ | 561 | /* Lock modules while optimizing kprobes */ |
563 | mutex_lock(&module_mutex); | 562 | mutex_lock(&module_mutex); |
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
566 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | 565 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) |
567 | * kprobes before waiting for quiescence period. | 566 | * kprobes before waiting for quiescence period. |
568 | */ | 567 | */ |
569 | do_unoptimize_kprobes(&free_list); | 568 | do_unoptimize_kprobes(); |
570 | 569 | ||
571 | /* | 570 | /* |
572 | * Step 2: Wait for quiescence period to ensure all running interrupts | 571 |
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
581 | do_optimize_kprobes(); | 580 | do_optimize_kprobes(); |
582 | 581 | ||
583 | /* Step 4: Free cleaned kprobes after quiescence period */ | 582 |
584 | do_free_cleaned_kprobes(&free_list); | 583 | do_free_cleaned_kprobes(); |
585 | 584 | ||
586 | mutex_unlock(&module_mutex); | 585 | mutex_unlock(&module_mutex); |
587 | mutex_unlock(&kprobe_mutex); | 586 | mutex_unlock(&kprobe_mutex); |
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) | |||
723 | if (!list_empty(&op->list)) | 722 | if (!list_empty(&op->list)) |
724 | /* Dequeue from the (un)optimization queue */ | 723 | /* Dequeue from the (un)optimization queue */ |
725 | list_del_init(&op->list); | 724 | list_del_init(&op->list); |
726 | |||
727 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 725 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
726 | |||
727 | if (kprobe_unused(p)) { | ||
728 | /* Enqueue if it is unused */ | ||
729 | list_add(&op->list, &freeing_list); | ||
730 | /* | ||
731 | * Remove unused probes from the hash list. After waiting | ||
732 | * for synchronization, this probe is reclaimed. | ||
733 | * (reclaiming is done by do_free_cleaned_kprobes().) | ||
734 | */ | ||
735 | hlist_del_rcu(&op->kp.hlist); | ||
736 | } | ||
737 | |||
728 | /* Don't touch the code, because it is already freed. */ | 738 | /* Don't touch the code, because it is already freed. */ |
729 | arch_remove_optimized_kprobe(op); | 739 | arch_remove_optimized_kprobe(op); |
730 | } | 740 | } |
diff --git a/kernel/mutex.c b/kernel/mutex.c index ad53a664f113..e581ada5faf4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock) | |||
254 | 254 | ||
255 | EXPORT_SYMBOL(mutex_unlock); | 255 | EXPORT_SYMBOL(mutex_unlock); |
256 | 256 | ||
257 | /** | ||
258 | * ww_mutex_unlock - release the w/w mutex | ||
259 | * @lock: the mutex to be released | ||
260 | * | ||
261 | * Unlock a mutex that has been locked by this task previously with any of the | ||
262 | * ww_mutex_lock* functions (with or without an acquire context). It is | ||
263 | * forbidden to release the locks after releasing the acquire context. | ||
264 | * | ||
265 | * This function must not be used in interrupt context. Unlocking | ||
266 | * of an unlocked mutex is not allowed. | ||
267 | */ | ||
268 | void __sched ww_mutex_unlock(struct ww_mutex *lock) | ||
269 | { | ||
270 | /* | ||
271 | * The unlocking fastpath is the 0->1 transition from 'locked' | ||
272 | * into 'unlocked' state: | ||
273 | */ | ||
274 | if (lock->ctx) { | ||
275 | #ifdef CONFIG_DEBUG_MUTEXES | ||
276 | DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); | ||
277 | #endif | ||
278 | if (lock->ctx->acquired > 0) | ||
279 | lock->ctx->acquired--; | ||
280 | lock->ctx = NULL; | ||
281 | } | ||
282 | |||
283 | #ifndef CONFIG_DEBUG_MUTEXES | ||
284 | /* | ||
285 | * When debugging is enabled we must not clear the owner before time, | ||
286 | * the slow path will always be taken, and that clears the owner field | ||
287 | * after verifying that it was indeed current. | ||
288 | */ | ||
289 | mutex_clear_owner(&lock->base); | ||
290 | #endif | ||
291 | __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); | ||
292 | } | ||
293 | EXPORT_SYMBOL(ww_mutex_unlock); | ||
294 | |||
295 | static inline int __sched | ||
296 | __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | ||
297 | { | ||
298 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | ||
299 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | ||
300 | |||
301 | if (!hold_ctx) | ||
302 | return 0; | ||
303 | |||
304 | if (unlikely(ctx == hold_ctx)) | ||
305 | return -EALREADY; | ||
306 | |||
307 | if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && | ||
308 | (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { | ||
309 | #ifdef CONFIG_DEBUG_MUTEXES | ||
310 | DEBUG_LOCKS_WARN_ON(ctx->contending_lock); | ||
311 | ctx->contending_lock = ww; | ||
312 | #endif | ||
313 | return -EDEADLK; | ||
314 | } | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | ||
320 | struct ww_acquire_ctx *ww_ctx) | ||
321 | { | ||
322 | #ifdef CONFIG_DEBUG_MUTEXES | ||
323 | /* | ||
324 | * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | ||
325 | * but released with a normal mutex_unlock in this call. | ||
326 | * | ||
327 | * This should never happen, always use ww_mutex_unlock. | ||
328 | */ | ||
329 | DEBUG_LOCKS_WARN_ON(ww->ctx); | ||
330 | |||
331 | /* | ||
332 | * Not quite done after calling ww_acquire_done() ? | ||
333 | */ | ||
334 | DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | ||
335 | |||
336 | if (ww_ctx->contending_lock) { | ||
337 | /* | ||
338 | * After -EDEADLK you tried to | ||
339 | * acquire a different ww_mutex? Bad! | ||
340 | */ | ||
341 | DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | ||
342 | |||
343 | /* | ||
344 | * You called ww_mutex_lock after receiving -EDEADLK, | ||
345 | * but 'forgot' to unlock everything else first? | ||
346 | */ | ||
347 | DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | ||
348 | ww_ctx->contending_lock = NULL; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Naughty, using a different class will lead to undefined behavior! | ||
353 | */ | ||
354 | DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | ||
355 | #endif | ||
356 | ww_ctx->acquired++; | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * after acquiring lock with fastpath or when we lost out in contested | ||
361 | * slowpath, set ctx and wake up any waiters so they can recheck. | ||
362 | * | ||
363 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | ||
364 | * as the fastpath and opportunistic spinning are disabled in that case. | ||
365 | */ | ||
366 | static __always_inline void | ||
367 | ww_mutex_set_context_fastpath(struct ww_mutex *lock, | ||
368 | struct ww_acquire_ctx *ctx) | ||
369 | { | ||
370 | unsigned long flags; | ||
371 | struct mutex_waiter *cur; | ||
372 | |||
373 | ww_mutex_lock_acquired(lock, ctx); | ||
374 | |||
375 | lock->ctx = ctx; | ||
376 | |||
377 | /* | ||
378 | * The lock->ctx update should be visible on all cores before | ||
379 | * the atomic read is done, otherwise contended waiters might be | ||
380 | * missed. The contended waiters will either see ww_ctx == NULL | ||
381 | * and keep spinning, or it will acquire wait_lock, add itself | ||
382 | * to waiter list and sleep. | ||
383 | */ | ||
384 | smp_mb(); /* ^^^ */ | ||
385 | |||
386 | /* | ||
387 | * Check if lock is contended, if not there is nobody to wake up | ||
388 | */ | ||
389 | if (likely(atomic_read(&lock->base.count) == 0)) | ||
390 | return; | ||
391 | |||
392 | /* | ||
393 | * Uh oh, we raced in fastpath, wake up everyone in this case, | ||
394 | * so they can see the new lock->ctx. | ||
395 | */ | ||
396 | spin_lock_mutex(&lock->base.wait_lock, flags); | ||
397 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
398 | debug_mutex_wake_waiter(&lock->base, cur); | ||
399 | wake_up_process(cur->task); | ||
400 | } | ||
401 | spin_unlock_mutex(&lock->base.wait_lock, flags); | ||
402 | } | ||
403 | |||
257 | /* | 404 | /* |
258 | * Lock a mutex (possibly interruptible), slowpath: | 405 | * Lock a mutex (possibly interruptible), slowpath: |
259 | */ | 406 | */ |
260 | static inline int __sched | 407 | static __always_inline int __sched |
261 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 408 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
262 | struct lockdep_map *nest_lock, unsigned long ip) | 409 | struct lockdep_map *nest_lock, unsigned long ip, |
410 | struct ww_acquire_ctx *ww_ctx) | ||
263 | { | 411 | { |
264 | struct task_struct *task = current; | 412 | struct task_struct *task = current; |
265 | struct mutex_waiter waiter; | 413 | struct mutex_waiter waiter; |
266 | unsigned long flags; | 414 | unsigned long flags; |
415 | int ret; | ||
267 | 416 | ||
268 | preempt_disable(); | 417 | preempt_disable(); |
269 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); | 418 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
@@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
298 | struct task_struct *owner; | 447 | struct task_struct *owner; |
299 | struct mspin_node node; | 448 | struct mspin_node node; |
300 | 449 | ||
450 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | ||
451 | struct ww_mutex *ww; | ||
452 | |||
453 | ww = container_of(lock, struct ww_mutex, base); | ||
454 | /* | ||
455 | * If ww->ctx is set the contents are undefined, only | ||
456 | * by acquiring wait_lock there is a guarantee that | ||
457 | * they are not invalid when reading. | ||
458 | * | ||
459 | * As such, when deadlock detection needs to be | ||
460 | * performed the optimistic spinning cannot be done. | ||
461 | */ | ||
462 | if (ACCESS_ONCE(ww->ctx)) | ||
463 | break; | ||
464 | } | ||
465 | |||
301 | /* | 466 | /* |
302 | * If there's an owner, wait for it to either | 467 | * If there's an owner, wait for it to either |
303 | * release the lock or go to sleep. | 468 | * release the lock or go to sleep. |
@@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
312 | if ((atomic_read(&lock->count) == 1) && | 477 | if ((atomic_read(&lock->count) == 1) && |
313 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 478 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
314 | lock_acquired(&lock->dep_map, ip); | 479 | lock_acquired(&lock->dep_map, ip); |
480 | if (!__builtin_constant_p(ww_ctx == NULL)) { | ||
481 | struct ww_mutex *ww; | ||
482 | ww = container_of(lock, struct ww_mutex, base); | ||
483 | |||
484 | ww_mutex_set_context_fastpath(ww, ww_ctx); | ||
485 | } | ||
486 | |||
315 | mutex_set_owner(lock); | 487 | mutex_set_owner(lock); |
316 | mspin_unlock(MLOCK(lock), &node); | 488 | mspin_unlock(MLOCK(lock), &node); |
317 | preempt_enable(); | 489 | preempt_enable(); |
@@ -371,15 +543,16 @@ slowpath: | |||
371 | * TASK_UNINTERRUPTIBLE case.) | 543 | * TASK_UNINTERRUPTIBLE case.) |
372 | */ | 544 | */ |
373 | if (unlikely(signal_pending_state(state, task))) { | 545 | if (unlikely(signal_pending_state(state, task))) { |
374 | mutex_remove_waiter(lock, &waiter, | 546 | ret = -EINTR; |
375 | task_thread_info(task)); | 547 | goto err; |
376 | mutex_release(&lock->dep_map, 1, ip); | 548 | } |
377 | spin_unlock_mutex(&lock->wait_lock, flags); | ||
378 | 549 | ||
379 | debug_mutex_free_waiter(&waiter); | 550 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { |
380 | preempt_enable(); | 551 | ret = __mutex_lock_check_stamp(lock, ww_ctx); |
381 | return -EINTR; | 552 | if (ret) |
553 | goto err; | ||
382 | } | 554 | } |
555 | |||
383 | __set_task_state(task, state); | 556 | __set_task_state(task, state); |
384 | 557 | ||
385 | /* didn't get the lock, go to sleep: */ | 558 | /* didn't get the lock, go to sleep: */ |
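In the reworked wait loop above, a sleeping ww waiter re-runs __mutex_lock_check_stamp() (not shown in this hunk) each time it wakes and bails out with -EDEADLK if it has to yield to an older acquire context. A hedged userspace model of that wait/die rule, assuming the usual policy that the context with the lower (older) stamp wins; check_stamp() and struct ctx below are demo names, not the kernel's:

#include <stdio.h>
#include <stddef.h>
#include <errno.h>

struct ctx { unsigned long stamp; };    /* lower stamp == older == wins */

static int check_stamp(const struct ctx *waiter, const struct ctx *holder)
{
        if (holder == NULL)
                return 0;               /* lock not held by a ww context */
        if (waiter->stamp > holder->stamp)
                return -EDEADLK;        /* younger waiter backs off, retries */
        return 0;                       /* older waiter keeps waiting */
}

int main(void)
{
        struct ctx old = { .stamp = 1 }, young = { .stamp = 2 };

        printf("young waiter vs old holder: %d\n", check_stamp(&young, &old));
        printf("old waiter vs young holder: %d\n", check_stamp(&old, &young));
        return 0;
}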
@@ -394,6 +567,30 @@ done: | |||
394 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | 567 | mutex_remove_waiter(lock, &waiter, current_thread_info()); |
395 | mutex_set_owner(lock); | 568 | mutex_set_owner(lock); |
396 | 569 | ||
570 | if (!__builtin_constant_p(ww_ctx == NULL)) { | ||
571 | struct ww_mutex *ww = container_of(lock, | ||
572 | struct ww_mutex, | ||
573 | base); | ||
574 | struct mutex_waiter *cur; | ||
575 | |||
576 | /* | ||
577 | * This branch gets optimized out for the common case, | ||
578 | * and is only important for ww_mutex_lock. | ||
579 | */ | ||
580 | |||
581 | ww_mutex_lock_acquired(ww, ww_ctx); | ||
582 | ww->ctx = ww_ctx; | ||
583 | |||
584 | /* | ||
585 | * Give any possible sleeping processes the chance to wake up, | ||
586 | * so they can recheck if they have to back off. | ||
587 | */ | ||
588 | list_for_each_entry(cur, &lock->wait_list, list) { | ||
589 | debug_mutex_wake_waiter(lock, cur); | ||
590 | wake_up_process(cur->task); | ||
591 | } | ||
592 | } | ||
593 | |||
397 | /* set it to 0 if there are no waiters left: */ | 594 | /* set it to 0 if there are no waiters left: */ |
398 | if (likely(list_empty(&lock->wait_list))) | 595 | if (likely(list_empty(&lock->wait_list))) |
399 | atomic_set(&lock->count, 0); | 596 | atomic_set(&lock->count, 0); |
@@ -404,6 +601,14 @@ done: | |||
404 | preempt_enable(); | 601 | preempt_enable(); |
405 | 602 | ||
406 | return 0; | 603 | return 0; |
604 | |||
605 | err: | ||
606 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | ||
607 | spin_unlock_mutex(&lock->wait_lock, flags); | ||
608 | debug_mutex_free_waiter(&waiter); | ||
609 | mutex_release(&lock->dep_map, 1, ip); | ||
610 | preempt_enable(); | ||
611 | return ret; | ||
407 | } | 612 | } |
408 | 613 | ||
409 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 614 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
@@ -411,7 +616,8 @@ void __sched | |||
411 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | 616 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) |
412 | { | 617 | { |
413 | might_sleep(); | 618 | might_sleep(); |
414 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); | 619 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
620 | subclass, NULL, _RET_IP_, NULL); | ||
415 | } | 621 | } |
416 | 622 | ||
417 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 623 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
@@ -420,7 +626,8 @@ void __sched | |||
420 | _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | 626 | _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) |
421 | { | 627 | { |
422 | might_sleep(); | 628 | might_sleep(); |
423 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); | 629 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
630 | 0, nest, _RET_IP_, NULL); | ||
424 | } | 631 | } |
425 | 632 | ||
426 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | 633 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); |
@@ -429,7 +636,8 @@ int __sched | |||
429 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | 636 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) |
430 | { | 637 | { |
431 | might_sleep(); | 638 | might_sleep(); |
432 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); | 639 | return __mutex_lock_common(lock, TASK_KILLABLE, |
640 | subclass, NULL, _RET_IP_, NULL); | ||
433 | } | 641 | } |
434 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 642 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
435 | 643 | ||
@@ -438,10 +646,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
438 | { | 646 | { |
439 | might_sleep(); | 647 | might_sleep(); |
440 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 648 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
441 | subclass, NULL, _RET_IP_); | 649 | subclass, NULL, _RET_IP_, NULL); |
442 | } | 650 | } |
443 | 651 | ||
444 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 652 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
653 | |||
654 | static inline int | ||
655 | ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | ||
656 | { | ||
657 | #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH | ||
658 | unsigned tmp; | ||
659 | |||
660 | if (ctx->deadlock_inject_countdown-- == 0) { | ||
661 | tmp = ctx->deadlock_inject_interval; | ||
662 | if (tmp > UINT_MAX/4) | ||
663 | tmp = UINT_MAX; | ||
664 | else | ||
665 | tmp = tmp*2 + tmp + tmp/2; | ||
666 | |||
667 | ctx->deadlock_inject_interval = tmp; | ||
668 | ctx->deadlock_inject_countdown = tmp; | ||
669 | ctx->contending_lock = lock; | ||
670 | |||
671 | ww_mutex_unlock(lock); | ||
672 | |||
673 | return -EDEADLK; | ||
674 | } | ||
675 | #endif | ||
676 | |||
677 | return 0; | ||
678 | } | ||
679 | |||
680 | int __sched | ||
681 | __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | ||
682 | { | ||
683 | int ret; | ||
684 | |||
685 | might_sleep(); | ||
686 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, | ||
687 | 0, &ctx->dep_map, _RET_IP_, ctx); | ||
688 | if (!ret && ctx->acquired > 0) | ||
689 | return ww_mutex_deadlock_injection(lock, ctx); | ||
690 | |||
691 | return ret; | ||
692 | } | ||
693 | EXPORT_SYMBOL_GPL(__ww_mutex_lock); | ||
694 | |||
695 | int __sched | ||
696 | __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | ||
697 | { | ||
698 | int ret; | ||
699 | |||
700 | might_sleep(); | ||
701 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, | ||
702 | 0, &ctx->dep_map, _RET_IP_, ctx); | ||
703 | |||
704 | if (!ret && ctx->acquired > 0) | ||
705 | return ww_mutex_deadlock_injection(lock, ctx); | ||
706 | |||
707 | return ret; | ||
708 | } | ||
709 | EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); | ||
710 | |||
445 | #endif | 711 | #endif |
446 | 712 | ||
447 | /* | 713 | /* |
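ww_mutex_deadlock_injection() above, compiled in only under CONFIG_DEBUG_WW_MUTEX_SLOWPATH, returns a fake -EDEADLK whenever the countdown hits zero and then grows the interval by roughly 3.5x (tmp*2 + tmp + tmp/2), clamped so it never overflows UINT_MAX. A standalone check of that arithmetic; the function and variable names below are local to the demo:

#include <stdio.h>
#include <limits.h>

static unsigned int next_interval(unsigned int tmp)
{
        if (tmp > UINT_MAX / 4)
                return UINT_MAX;        /* clamp instead of overflowing */
        return tmp * 2 + tmp + tmp / 2; /* ~3.5x growth */
}

int main(void)
{
        unsigned int interval = 1;

        for (int i = 0; i < 10; i++) {
                printf("injection %d: next interval %u\n", i, interval);
                interval = next_interval(interval);
        }
        return 0;
}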
@@ -494,10 +760,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) | |||
494 | * mutex_lock_interruptible() and mutex_trylock(). | 760 | * mutex_lock_interruptible() and mutex_trylock(). |
495 | */ | 761 | */ |
496 | static noinline int __sched | 762 | static noinline int __sched |
497 | __mutex_lock_killable_slowpath(atomic_t *lock_count); | 763 | __mutex_lock_killable_slowpath(struct mutex *lock); |
498 | 764 | ||
499 | static noinline int __sched | 765 | static noinline int __sched |
500 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); | 766 | __mutex_lock_interruptible_slowpath(struct mutex *lock); |
501 | 767 | ||
502 | /** | 768 | /** |
503 | * mutex_lock_interruptible - acquire the mutex, interruptible | 769 | * mutex_lock_interruptible - acquire the mutex, interruptible |
@@ -515,12 +781,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock) | |||
515 | int ret; | 781 | int ret; |
516 | 782 | ||
517 | might_sleep(); | 783 | might_sleep(); |
518 | ret = __mutex_fastpath_lock_retval | 784 | ret = __mutex_fastpath_lock_retval(&lock->count); |
519 | (&lock->count, __mutex_lock_interruptible_slowpath); | 785 | if (likely(!ret)) { |
520 | if (!ret) | ||
521 | mutex_set_owner(lock); | 786 | mutex_set_owner(lock); |
522 | 787 | return 0; | |
523 | return ret; | 788 | } else |
789 | return __mutex_lock_interruptible_slowpath(lock); | ||
524 | } | 790 | } |
525 | 791 | ||
526 | EXPORT_SYMBOL(mutex_lock_interruptible); | 792 | EXPORT_SYMBOL(mutex_lock_interruptible); |
@@ -530,12 +796,12 @@ int __sched mutex_lock_killable(struct mutex *lock) | |||
530 | int ret; | 796 | int ret; |
531 | 797 | ||
532 | might_sleep(); | 798 | might_sleep(); |
533 | ret = __mutex_fastpath_lock_retval | 799 | ret = __mutex_fastpath_lock_retval(&lock->count); |
534 | (&lock->count, __mutex_lock_killable_slowpath); | 800 | if (likely(!ret)) { |
535 | if (!ret) | ||
536 | mutex_set_owner(lock); | 801 | mutex_set_owner(lock); |
537 | 802 | return 0; | |
538 | return ret; | 803 | } else |
804 | return __mutex_lock_killable_slowpath(lock); | ||
539 | } | 805 | } |
540 | EXPORT_SYMBOL(mutex_lock_killable); | 806 | EXPORT_SYMBOL(mutex_lock_killable); |
541 | 807 | ||
@@ -544,24 +810,39 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
544 | { | 810 | { |
545 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 811 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
546 | 812 | ||
547 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); | 813 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, |
814 | NULL, _RET_IP_, NULL); | ||
548 | } | 815 | } |
549 | 816 | ||
550 | static noinline int __sched | 817 | static noinline int __sched |
551 | __mutex_lock_killable_slowpath(atomic_t *lock_count) | 818 | __mutex_lock_killable_slowpath(struct mutex *lock) |
552 | { | 819 | { |
553 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 820 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, |
821 | NULL, _RET_IP_, NULL); | ||
822 | } | ||
554 | 823 | ||
555 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); | 824 | static noinline int __sched |
825 | __mutex_lock_interruptible_slowpath(struct mutex *lock) | ||
826 | { | ||
827 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, | ||
828 | NULL, _RET_IP_, NULL); | ||
556 | } | 829 | } |
557 | 830 | ||
558 | static noinline int __sched | 831 | static noinline int __sched |
559 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | 832 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) |
560 | { | 833 | { |
561 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 834 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, |
835 | NULL, _RET_IP_, ctx); | ||
836 | } | ||
562 | 837 | ||
563 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); | 838 | static noinline int __sched |
839 | __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, | ||
840 | struct ww_acquire_ctx *ctx) | ||
841 | { | ||
842 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, | ||
843 | NULL, _RET_IP_, ctx); | ||
564 | } | 844 | } |
845 | |||
565 | #endif | 846 | #endif |
566 | 847 | ||
567 | /* | 848 | /* |
@@ -617,6 +898,45 @@ int __sched mutex_trylock(struct mutex *lock) | |||
617 | } | 898 | } |
618 | EXPORT_SYMBOL(mutex_trylock); | 899 | EXPORT_SYMBOL(mutex_trylock); |
619 | 900 | ||
901 | #ifndef CONFIG_DEBUG_LOCK_ALLOC | ||
902 | int __sched | ||
903 | __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | ||
904 | { | ||
905 | int ret; | ||
906 | |||
907 | might_sleep(); | ||
908 | |||
909 | ret = __mutex_fastpath_lock_retval(&lock->base.count); | ||
910 | |||
911 | if (likely(!ret)) { | ||
912 | ww_mutex_set_context_fastpath(lock, ctx); | ||
913 | mutex_set_owner(&lock->base); | ||
914 | } else | ||
915 | ret = __ww_mutex_lock_slowpath(lock, ctx); | ||
916 | return ret; | ||
917 | } | ||
918 | EXPORT_SYMBOL(__ww_mutex_lock); | ||
919 | |||
920 | int __sched | ||
921 | __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | ||
922 | { | ||
923 | int ret; | ||
924 | |||
925 | might_sleep(); | ||
926 | |||
927 | ret = __mutex_fastpath_lock_retval(&lock->base.count); | ||
928 | |||
929 | if (likely(!ret)) { | ||
930 | ww_mutex_set_context_fastpath(lock, ctx); | ||
931 | mutex_set_owner(&lock->base); | ||
932 | } else | ||
933 | ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); | ||
934 | return ret; | ||
935 | } | ||
936 | EXPORT_SYMBOL(__ww_mutex_lock_interruptible); | ||
937 | |||
938 | #endif | ||
939 | |||
620 | /** | 940 | /** |
621 | * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 | 941 | * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 |
622 | * @cnt: the atomic which we are to dec | 942 | * @cnt: the atomic which we are to dec |
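The !CONFIG_DEBUG_LOCK_ALLOC block above shows the other half of the rework: __mutex_fastpath_lock_retval() now only reports whether the uncontended decrement succeeded, and the C caller chooses the slowpath, whereas the old interface tail-called a fixed fail function that could only receive the atomic counter and so had no way to carry a ww acquire context. A rough userspace model of that fastpath/slowpath split, with a plain C11 atomic standing in for the arch-specific fastpath; all names are demo-local:

#include <stdio.h>
#include <stdatomic.h>

static int fastpath_lock_retval(atomic_int *count)
{
        /* 1 = unlocked; the decrement takes it, anything else fails */
        return atomic_fetch_sub(count, 1) == 1 ? 0 : -1;
}

static int slowpath_lock(atomic_int *count)
{
        (void)count;    /* a real slowpath would queue on the wait list */
        printf("slowpath: would sleep until the owner unlocks\n");
        return 0;
}

static int demo_lock(atomic_int *count)
{
        if (fastpath_lock_retval(count) == 0) {
                printf("fastpath: got the lock\n");
                return 0;
        }
        return slowpath_lock(count);    /* contended: caller picks slowpath */
}

int main(void)
{
        atomic_int count = 1;

        demo_lock(&count);      /* uncontended: fastpath */
        demo_lock(&count);      /* contended: slowpath */
        return 0;
}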
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..d444c4e834f4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -100,7 +100,6 @@ config PM_SLEEP_SMP | |||
100 | depends on SMP | 100 | depends on SMP |
101 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | 101 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE |
102 | depends on PM_SLEEP | 102 | depends on PM_SLEEP |
103 | select HOTPLUG | ||
104 | select HOTPLUG_CPU | 103 | select HOTPLUG_CPU |
105 | 104 | ||
106 | config PM_AUTOSLEEP | 105 | config PM_AUTOSLEEP |
@@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS | |||
263 | bool | 262 | bool |
264 | depends on PM | 263 | depends on PM |
265 | 264 | ||
265 | config WQ_POWER_EFFICIENT_DEFAULT | ||
266 | bool "Enable workqueue power-efficient mode by default" | ||
267 | depends on PM | ||
268 | default n | ||
269 | help | ||
270 | Per-cpu workqueues are generally preferred because they show | ||
271 | better performance thanks to cache locality; unfortunately, | ||
272 | per-cpu workqueues tend to be more power hungry than unbound | ||
273 | workqueues. | ||
274 | |||
275 | Enabling workqueue.power_efficient kernel parameter makes the | ||
276 | per-cpu workqueues which were observed to contribute | ||
277 | significantly to power consumption unbound, leading to measurably | ||
278 | lower power usage at the cost of small performance overhead. | ||
279 | |||
280 | This config option determines whether workqueue.power_efficient | ||
281 | is enabled by default. | ||
282 | |||
283 | If in doubt, say N. | ||
284 | |||
266 | config PM_GENERIC_DOMAINS_SLEEP | 285 | config PM_GENERIC_DOMAINS_SLEEP |
267 | def_bool y | 286 | def_bool y |
268 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | 287 | depends on PM_SLEEP && PM_GENERIC_DOMAINS |
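The new WQ_POWER_EFFICIENT_DEFAULT option above only selects the boot-time default for the workqueue.power_efficient parameter; the consumer lives in kernel/workqueue.c and is not part of this hunk. A hedged sketch of how a "default n" Kconfig bool typically feeds such a runtime flag (CONFIG_WQ_POWER_EFFICIENT_DEFAULT is real, everything else below is a userspace stand-in):

#include <stdio.h>
#include <stdbool.h>

/* stand-in for the generated autoconf.h: define it to model "y" */
/* #define CONFIG_WQ_POWER_EFFICIENT_DEFAULT 1 */

#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
static bool wq_power_efficient = true;
#else
static bool wq_power_efficient = false;
#endif

static void parse_bootarg(const char *arg)
{
        /* models "workqueue.power_efficient=1" overriding the default */
        if (arg)
                wq_power_efficient = (arg[0] == '1');
}

int main(void)
{
        parse_bootarg(NULL);
        printf("power_efficient default: %d\n", wq_power_efficient);
        parse_bootarg("1");
        printf("after boot parameter:    %d\n", wq_power_efficient);
        return 0;
}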
diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -363,6 +363,53 @@ static void log_store(int facility, int level, | |||
363 | log_next_seq++; | 363 | log_next_seq++; |
364 | } | 364 | } |
365 | 365 | ||
366 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | ||
367 | int dmesg_restrict = 1; | ||
368 | #else | ||
369 | int dmesg_restrict; | ||
370 | #endif | ||
371 | |||
372 | static int syslog_action_restricted(int type) | ||
373 | { | ||
374 | if (dmesg_restrict) | ||
375 | return 1; | ||
376 | /* | ||
377 | * Unless restricted, we allow "read all" and "get buffer size" | ||
378 | * for everybody. | ||
379 | */ | ||
380 | return type != SYSLOG_ACTION_READ_ALL && | ||
381 | type != SYSLOG_ACTION_SIZE_BUFFER; | ||
382 | } | ||
383 | |||
384 | static int check_syslog_permissions(int type, bool from_file) | ||
385 | { | ||
386 | /* | ||
387 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
388 | * already done the capabilities checks at open time. | ||
389 | */ | ||
390 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
391 | return 0; | ||
392 | |||
393 | if (syslog_action_restricted(type)) { | ||
394 | if (capable(CAP_SYSLOG)) | ||
395 | return 0; | ||
396 | /* | ||
397 | * For historical reasons, accept CAP_SYS_ADMIN too, with | ||
398 | * a warning. | ||
399 | */ | ||
400 | if (capable(CAP_SYS_ADMIN)) { | ||
401 | pr_warn_once("%s (%d): Attempt to access syslog with " | ||
402 | "CAP_SYS_ADMIN but no CAP_SYSLOG " | ||
403 | "(deprecated).\n", | ||
404 | current->comm, task_pid_nr(current)); | ||
405 | return 0; | ||
406 | } | ||
407 | return -EPERM; | ||
408 | } | ||
409 | return security_syslog(type); | ||
410 | } | ||
411 | |||
412 | |||
366 | /* /dev/kmsg - userspace message inject/listen interface */ | 413 | /* /dev/kmsg - userspace message inject/listen interface */ |
367 | struct devkmsg_user { | 414 | struct devkmsg_user { |
368 | u64 seq; | 415 | u64 seq; |
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) | |||
620 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | 667 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) |
621 | return 0; | 668 | return 0; |
622 | 669 | ||
623 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | 670 | err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, |
671 | SYSLOG_FROM_READER); | ||
624 | if (err) | 672 | if (err) |
625 | return err; | 673 | return err; |
626 | 674 | ||
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) | |||
813 | } | 861 | } |
814 | #endif | 862 | #endif |
815 | 863 | ||
816 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | ||
817 | int dmesg_restrict = 1; | ||
818 | #else | ||
819 | int dmesg_restrict; | ||
820 | #endif | ||
821 | |||
822 | static int syslog_action_restricted(int type) | ||
823 | { | ||
824 | if (dmesg_restrict) | ||
825 | return 1; | ||
826 | /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ | ||
827 | return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; | ||
828 | } | ||
829 | |||
830 | static int check_syslog_permissions(int type, bool from_file) | ||
831 | { | ||
832 | /* | ||
833 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
834 | * already done the capabilities checks at open time. | ||
835 | */ | ||
836 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
837 | return 0; | ||
838 | |||
839 | if (syslog_action_restricted(type)) { | ||
840 | if (capable(CAP_SYSLOG)) | ||
841 | return 0; | ||
842 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | ||
843 | if (capable(CAP_SYS_ADMIN)) { | ||
844 | printk_once(KERN_WARNING "%s (%d): " | ||
845 | "Attempt to access syslog with CAP_SYS_ADMIN " | ||
846 | "but no CAP_SYSLOG (deprecated).\n", | ||
847 | current->comm, task_pid_nr(current)); | ||
848 | return 0; | ||
849 | } | ||
850 | return -EPERM; | ||
851 | } | ||
852 | return 0; | ||
853 | } | ||
854 | |||
855 | #if defined(CONFIG_PRINTK_TIME) | 864 | #if defined(CONFIG_PRINTK_TIME) |
856 | static bool printk_time = 1; | 865 | static bool printk_time = 1; |
857 | #else | 866 | #else |
@@ -1249,7 +1258,7 @@ out: | |||
1249 | 1258 | ||
1250 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | 1259 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) |
1251 | { | 1260 | { |
1252 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1261 | return do_syslog(type, buf, len, SYSLOG_FROM_READER); |
1253 | } | 1262 | } |
1254 | 1263 | ||
1255 | /* | 1264 | /* |
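check_syslog_permissions() and syslog_action_restricted() are moved above their new caller devkmsg_open(), so opening /dev/kmsg for reading now goes through the same capability checks as the syslog(2) READ_ALL path. A userspace model of the decision logic as written above, with plain booleans standing in for capable() and made-up ACT_* constants instead of the SYSLOG_ACTION_* values from <linux/syslog.h>:

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

enum { ACT_OPEN, ACT_READ_ALL, ACT_SIZE_BUFFER, ACT_CLEAR };

static bool dmesg_restrict;             /* sysctl kernel.dmesg_restrict */

static bool action_restricted(int type)
{
        if (dmesg_restrict)
                return true;
        /* unless restricted, "read all" and "size" are open to everyone */
        return type != ACT_READ_ALL && type != ACT_SIZE_BUFFER;
}

static int check_permissions(int type, bool from_file,
                             bool cap_syslog, bool cap_sys_admin)
{
        if (from_file && type != ACT_OPEN)
                return 0;               /* checked once at open time */
        if (action_restricted(type)) {
                if (cap_syslog)
                        return 0;
                if (cap_sys_admin)
                        return 0;       /* accepted, with deprecation warning */
                return -EPERM;
        }
        return 0;                       /* would fall through to security_syslog() */
}

int main(void)
{
        dmesg_restrict = true;
        printf("unprivileged READ_ALL: %d\n",
               check_permissions(ACT_READ_ALL, false, false, false));
        printf("CAP_SYSLOG READ_ALL:   %d\n",
               check_permissions(ACT_READ_ALL, false, true, false));
        return 0;
}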
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..335a7ae697f5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child, | |||
665 | if (unlikely(is_compat_task())) { | 665 | if (unlikely(is_compat_task())) { |
666 | compat_siginfo_t __user *uinfo = compat_ptr(data); | 666 | compat_siginfo_t __user *uinfo = compat_ptr(data); |
667 | 667 | ||
668 | ret = copy_siginfo_to_user32(uinfo, &info); | 668 | if (copy_siginfo_to_user32(uinfo, &info) || |
669 | ret |= __put_user(info.si_code, &uinfo->si_code); | 669 | __put_user(info.si_code, &uinfo->si_code)) { |
670 | ret = -EFAULT; | ||
671 | break; | ||
672 | } | ||
673 | |||
670 | } else | 674 | } else |
671 | #endif | 675 | #endif |
672 | { | 676 | { |
673 | siginfo_t __user *uinfo = (siginfo_t __user *) data; | 677 | siginfo_t __user *uinfo = (siginfo_t __user *) data; |
674 | 678 | ||
675 | ret = copy_siginfo_to_user(uinfo, &info); | 679 | if (copy_siginfo_to_user(uinfo, &info) || |
676 | ret |= __put_user(info.si_code, &uinfo->si_code); | 680 | __put_user(info.si_code, &uinfo->si_code)) { |
677 | } | 681 | ret = -EFAULT; |
678 | 682 | break; | |
679 | if (ret) { | 683 | } |
680 | ret = -EFAULT; | ||
681 | break; | ||
682 | } | 684 | } |
683 | 685 | ||
684 | data += sizeof(siginfo_t); | 686 | data += sizeof(siginfo_t); |
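The ptrace_peek_siginfo() change above replaces the OR-ed accumulation of the two copy results, where the second copy always ran even after the first one faulted, with a single short-circuiting condition that bails out of the loop with -EFAULT on the first failure. A userspace model of that pattern; fake_copy() stands in for copy_siginfo_to_user() and __put_user(), which obviously are not available here:

#include <stdio.h>
#include <errno.h>

static int fake_copy(const char *what, int fail)
{
        printf("  copying %s\n", what);
        return fail;            /* nonzero == faulted */
}

static int old_style(void)
{
        int ret = fake_copy("siginfo", 1);      /* fails ... */
        ret |= fake_copy("si_code", 0);         /* ... but this still runs */
        return ret ? -EFAULT : 0;
}

static int new_style(void)
{
        if (fake_copy("siginfo", 1) || fake_copy("si_code", 0))
                return -EFAULT;                 /* second copy is skipped */
        return 0;
}

int main(void)
{
        printf("old:\n");
        printf("  -> %d\n", old_style());
        printf("new:\n");
        printf("  -> %d\n", new_style());
        return 0;
}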
diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -4,7 +4,7 @@ | |||
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/init.h> | 5 | #include <linux/init.h> |
6 | #include <linux/sort.h> | 6 | #include <linux/sort.h> |
7 | 7 | #include <linux/string.h> | |
8 | #include <linux/range.h> | 8 | #include <linux/range.h> |
9 | 9 | ||
10 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | 10 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) |
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, | |||
32 | if (start >= end) | 32 | if (start >= end) |
33 | return nr_range; | 33 | return nr_range; |
34 | 34 | ||
35 | /* Try to merge it with old one: */ | 35 | /* get new start/end: */ |
36 | for (i = 0; i < nr_range; i++) { | 36 | for (i = 0; i < nr_range; i++) { |
37 | u64 final_start, final_end; | ||
38 | u64 common_start, common_end; | 37 | u64 common_start, common_end; |
39 | 38 | ||
40 | if (!range[i].end) | 39 | if (!range[i].end) |
@@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, | |||
45 | if (common_start > common_end) | 44 | if (common_start > common_end) |
46 | continue; | 45 | continue; |
47 | 46 | ||
48 | final_start = min(range[i].start, start); | 47 | /* new start/end, will add it back at last */ |
49 | final_end = max(range[i].end, end); | 48 | start = min(range[i].start, start); |
49 | end = max(range[i].end, end); | ||
50 | 50 | ||
51 | /* clear it and add it back for further merge */ | 51 | memmove(&range[i], &range[i + 1], |
52 | range[i].start = 0; | 52 | (nr_range - (i + 1)) * sizeof(range[i])); |
53 | range[i].end = 0; | 53 | range[nr_range - 1].start = 0; |
54 | return add_range_with_merge(range, az, nr_range, | 54 | range[nr_range - 1].end = 0; |
55 | final_start, final_end); | 55 | nr_range--; |
56 | i--; | ||
56 | } | 57 | } |
57 | 58 | ||
58 | /* Need to add it: */ | 59 | /* Need to add it: */ |
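The loop above replaces the old "zero the slot and recurse" merge with an in-place one: the incoming [start, end) is widened, the merged slot is memmove()d away, and the same index is re-examined, so a single insertion can swallow several existing ranges in one pass. A standalone copy of that logic; struct range and add_range() are re-declared locally instead of using <linux/range.h>:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct range { uint64_t start, end; };

static int add_range(struct range *range, int az, int nr_range,
                     uint64_t start, uint64_t end)
{
        if (start >= end || nr_range >= az)
                return nr_range;
        range[nr_range].start = start;
        range[nr_range].end = end;
        return nr_range + 1;
}

static int add_range_with_merge(struct range *range, int az, int nr_range,
                                uint64_t start, uint64_t end)
{
        int i;

        if (start >= end)
                return nr_range;

        /* try to merge with every overlapping or adjacent entry */
        for (i = 0; i < nr_range; i++) {
                uint64_t common_start, common_end;

                if (!range[i].end)
                        continue;
                common_start = range[i].start > start ? range[i].start : start;
                common_end = range[i].end < end ? range[i].end : end;
                if (common_start > common_end)
                        continue;

                /* widen the new range, drop the old entry, recheck index i */
                start = range[i].start < start ? range[i].start : start;
                end = range[i].end > end ? range[i].end : end;
                memmove(&range[i], &range[i + 1],
                        (nr_range - (i + 1)) * sizeof(range[i]));
                range[nr_range - 1].start = 0;
                range[nr_range - 1].end = 0;
                nr_range--;
                i--;
        }

        return add_range(range, az, nr_range, start, end);
}

int main(void)
{
        struct range r[8] = { { 0, 10 }, { 20, 30 } };
        int nr = add_range_with_merge(r, 8, 2, 5, 25);  /* bridges both */

        for (int i = 0; i < nr; i++)
                printf("[%llu, %llu)\n",
                       (unsigned long long)r[i].start,
                       (unsigned long long)r[i].end);
        return 0;
}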
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4c..cce6ba8bbace 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void) | |||
104 | } | 104 | } |
105 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 105 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
106 | 106 | ||
107 | /* | 107 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
108 | * Check for a task exiting while in a preemptible-RCU read-side | ||
109 | * critical section, clean up if so. No need to issue warnings, | ||
110 | * as debug_check_no_locks_held() already does this if lockdep | ||
111 | * is enabled. | ||
112 | */ | ||
113 | void exit_rcu(void) | ||
114 | { | ||
115 | struct task_struct *t = current; | ||
116 | |||
117 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
118 | return; | ||
119 | t->rcu_read_lock_nesting = 1; | ||
120 | barrier(); | ||
121 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
122 | __rcu_read_unlock(); | ||
123 | } | ||
124 | |||
125 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
126 | |||
127 | void exit_rcu(void) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
132 | 108 | ||
133 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 109 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
134 | static struct lock_class_key rcu_lock_key; | 110 | static struct lock_class_key rcu_lock_key; |
@@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key; | |||
145 | struct lockdep_map rcu_sched_lock_map = | 121 | struct lockdep_map rcu_sched_lock_map = |
146 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | 122 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); |
147 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | 123 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); |
148 | #endif | ||
149 | |||
150 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
151 | 124 | ||
152 | int debug_lockdep_rcu_enabled(void) | 125 | int debug_lockdep_rcu_enabled(void) |
153 | { | 126 | { |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d7..aa344111de3e 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -44,7 +44,6 @@ | |||
44 | 44 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 46 | struct rcu_ctrlblk; |
47 | static void invoke_rcu_callbacks(void); | ||
48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 47 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
49 | static void rcu_process_callbacks(struct softirq_action *unused); | 48 | static void rcu_process_callbacks(struct softirq_action *unused); |
50 | static void __call_rcu(struct rcu_head *head, | 49 | static void __call_rcu(struct rcu_head *head, |
@@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
205 | */ | 204 | */ |
206 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 205 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
207 | { | 206 | { |
208 | reset_cpu_stall_ticks(rcp); | 207 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); |
209 | if (rcp->rcucblist != NULL && | 208 | if (rcp->rcucblist != NULL && |
210 | rcp->donetail != rcp->curtail) { | 209 | rcp->donetail != rcp->curtail) { |
211 | rcp->donetail = rcp->curtail; | 210 | rcp->donetail = rcp->curtail; |
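reset_cpu_stall_ticks() and, further down, check_cpu_stalls() are now invoked through RCU_TRACE(), which expands to its argument when CONFIG_RCU_TRACE is set and to nothing otherwise, so the purely diagnostic stall-warning calls vanish from non-tracing builds. A tiny model of that statement-swallowing macro; DEMO_TRACE stands in for CONFIG_RCU_TRACE:

#include <stdio.h>

/* #define DEMO_TRACE 1 */
#ifdef DEMO_TRACE
#define RCU_TRACE(stmt) stmt
#else
#define RCU_TRACE(stmt)
#endif

static void reset_stall_ticks(void)
{
        printf("stall-warning state reset\n");
}

int main(void)
{
        RCU_TRACE(reset_stall_ticks()); /* compiled out unless DEMO_TRACE */
        printf("quiescent state recorded\n");
        return 0;
}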
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu) | |||
227 | local_irq_save(flags); | 226 | local_irq_save(flags); |
228 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 227 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
229 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 228 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
230 | invoke_rcu_callbacks(); | 229 | raise_softirq(RCU_SOFTIRQ); |
231 | local_irq_restore(flags); | 230 | local_irq_restore(flags); |
232 | } | 231 | } |
233 | 232 | ||
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu) | |||
240 | 239 | ||
241 | local_irq_save(flags); | 240 | local_irq_save(flags); |
242 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 241 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
243 | invoke_rcu_callbacks(); | 242 | raise_softirq(RCU_SOFTIRQ); |
244 | local_irq_restore(flags); | 243 | local_irq_restore(flags); |
245 | } | 244 | } |
246 | 245 | ||
@@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu) | |||
252 | */ | 251 | */ |
253 | void rcu_check_callbacks(int cpu, int user) | 252 | void rcu_check_callbacks(int cpu, int user) |
254 | { | 253 | { |
255 | check_cpu_stalls(); | 254 | RCU_TRACE(check_cpu_stalls()); |
256 | if (user || rcu_is_cpu_rrupt_from_idle()) | 255 | if (user || rcu_is_cpu_rrupt_from_idle()) |
257 | rcu_sched_qs(cpu); | 256 | rcu_sched_qs(cpu); |
258 | else if (!in_softirq()) | 257 | else if (!in_softirq()) |
259 | rcu_bh_qs(cpu); | 258 | rcu_bh_qs(cpu); |
260 | rcu_preempt_check_callbacks(); | ||
261 | } | 259 | } |
262 | 260 | ||
263 | /* | 261 | /* |
@@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
278 | ACCESS_ONCE(rcp->rcucblist), | 276 | ACCESS_ONCE(rcp->rcucblist), |
279 | need_resched(), | 277 | need_resched(), |
280 | is_idle_task(current), | 278 | is_idle_task(current), |
281 | rcu_is_callbacks_kthread())); | 279 | false)); |
282 | return; | 280 | return; |
283 | } | 281 | } |
284 | 282 | ||
@@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
290 | *rcp->donetail = NULL; | 288 | *rcp->donetail = NULL; |
291 | if (rcp->curtail == rcp->donetail) | 289 | if (rcp->curtail == rcp->donetail) |
292 | rcp->curtail = &rcp->rcucblist; | 290 | rcp->curtail = &rcp->rcucblist; |
293 | rcu_preempt_remove_callbacks(rcp); | ||
294 | rcp->donetail = &rcp->rcucblist; | 291 | rcp->donetail = &rcp->rcucblist; |
295 | local_irq_restore(flags); | 292 | local_irq_restore(flags); |
296 | 293 | ||
@@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
309 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 306 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
310 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), | 307 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), |
311 | is_idle_task(current), | 308 | is_idle_task(current), |
312 | rcu_is_callbacks_kthread())); | 309 | false)); |
313 | } | 310 | } |
314 | 311 | ||
315 | static void rcu_process_callbacks(struct softirq_action *unused) | 312 | static void rcu_process_callbacks(struct softirq_action *unused) |
316 | { | 313 | { |
317 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 314 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
318 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 315 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
319 | rcu_preempt_process_callbacks(); | ||
320 | } | 316 | } |
321 | 317 | ||
322 | /* | 318 | /* |
@@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
382 | __call_rcu(head, func, &rcu_bh_ctrlblk); | 378 | __call_rcu(head, func, &rcu_bh_ctrlblk); |
383 | } | 379 | } |
384 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 380 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
381 | |||
382 | void rcu_init(void) | ||
383 | { | ||
384 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
385 | } | ||
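With the TINY_PREEMPT_RCU callback kthread gone, rcutiny signals callback processing purely through RCU_SOFTIRQ: quiescent states call raise_softirq() and the new rcu_init() registers rcu_process_callbacks() via open_softirq(). A userspace model of that register/raise/dispatch flow; the demo_* names and the pending bitmask are local to this sketch, not kernel API:

#include <stdio.h>

#define NR_VECS 8
enum { DEMO_RCU_SOFTIRQ = 3 };

static void (*handlers[NR_VECS])(void);
static unsigned int pending;

static void demo_open_softirq(int nr, void (*fn)(void))
{
        handlers[nr] = fn;
}

static void demo_raise_softirq(int nr)
{
        pending |= 1u << nr;    /* just mark it; run later, outside irq */
}

static void demo_do_softirq(void)
{
        for (int nr = 0; nr < NR_VECS; nr++)
                if ((pending & (1u << nr)) && handlers[nr]) {
                        pending &= ~(1u << nr);
                        handlers[nr]();
                }
}

static void rcu_process_callbacks_demo(void)
{
        printf("invoking queued RCU callbacks\n");
}

int main(void)
{
        demo_open_softirq(DEMO_RCU_SOFTIRQ, rcu_process_callbacks_demo);

        demo_raise_softirq(DEMO_RCU_SOFTIRQ);   /* what rcu_sched_qs() now does */
        demo_do_softirq();                      /* run at the next softirq point */
        return 0;
}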
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002faeb..0cd385acccfa 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |||
53 | }; | 53 | }; |
54 | 54 | ||
55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
56 | #include <linux/kernel_stat.h> | ||
57 | |||
56 | int rcu_scheduler_active __read_mostly; | 58 | int rcu_scheduler_active __read_mostly; |
57 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
58 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
59 | |||
60 | #ifdef CONFIG_RCU_TRACE | ||
61 | |||
62 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) | ||
63 | { | ||
64 | unsigned long j; | ||
65 | unsigned long js; | ||
66 | |||
67 | if (rcu_cpu_stall_suppress) | ||
68 | return; | ||
69 | rcp->ticks_this_gp++; | ||
70 | j = jiffies; | ||
71 | js = rcp->jiffies_stall; | ||
72 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | ||
73 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | ||
74 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | ||
75 | jiffies - rcp->gp_start, rcp->qlen); | ||
76 | dump_stack(); | ||
77 | } | ||
78 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
79 | rcp->jiffies_stall = jiffies + | ||
80 | 3 * rcu_jiffies_till_stall_check() + 3; | ||
81 | else if (ULONG_CMP_GE(j, js)) | ||
82 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
83 | } | ||
84 | |||
85 | static void check_cpu_stall_preempt(void); | ||
86 | |||
87 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
88 | |||
89 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | ||
90 | { | ||
91 | #ifdef CONFIG_RCU_TRACE | ||
92 | rcp->ticks_this_gp = 0; | ||
93 | rcp->gp_start = jiffies; | ||
94 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
95 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
96 | } | ||
97 | |||
98 | static void check_cpu_stalls(void) | ||
99 | { | ||
100 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | ||
101 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | ||
102 | RCU_TRACE(check_cpu_stall_preempt()); | ||
103 | } | ||
104 | |||
105 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
106 | |||
107 | #include <linux/delay.h> | ||
108 | |||
109 | /* Global control variables for preemptible RCU. */ | ||
110 | struct rcu_preempt_ctrlblk { | ||
111 | struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ | ||
112 | struct rcu_head **nexttail; | ||
113 | /* Tasks blocked in a preemptible RCU */ | ||
114 | /* read-side critical section while an */ | ||
115 | /* preemptible-RCU grace period is in */ | ||
116 | /* progress must wait for a later grace */ | ||
117 | /* period. This pointer points to the */ | ||
118 | /* ->next pointer of the last task that */ | ||
119 | /* must wait for a later grace period, or */ | ||
120 | /* to &->rcb.rcucblist if there is no */ | ||
121 | /* such task. */ | ||
122 | struct list_head blkd_tasks; | ||
123 | /* Tasks blocked in RCU read-side critical */ | ||
124 | /* section. Tasks are placed at the head */ | ||
125 | /* of this list and age towards the tail. */ | ||
126 | struct list_head *gp_tasks; | ||
127 | /* Pointer to the first task blocking the */ | ||
128 | /* current grace period, or NULL if there */ | ||
129 | /* is no such task. */ | ||
130 | struct list_head *exp_tasks; | ||
131 | /* Pointer to first task blocking the */ | ||
132 | /* current expedited grace period, or NULL */ | ||
133 | /* if there is no such task. If there */ | ||
134 | /* is no current expedited grace period, */ | ||
135 | /* then there cannot be any such task. */ | ||
136 | #ifdef CONFIG_RCU_BOOST | ||
137 | struct list_head *boost_tasks; | ||
138 | /* Pointer to first task that needs to be */ | ||
139 | /* priority-boosted, or NULL if no priority */ | ||
140 | /* boosting is needed. If there is no */ | ||
141 | /* current or expedited grace period, there */ | ||
142 | /* can be no such task. */ | ||
143 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
144 | u8 gpnum; /* Current grace period. */ | ||
145 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | ||
146 | u8 completed; /* Last grace period completed. */ | ||
147 | /* If all three are equal, RCU is idle. */ | ||
148 | #ifdef CONFIG_RCU_BOOST | ||
149 | unsigned long boost_time; /* When to start boosting (jiffies) */ | ||
150 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
151 | #ifdef CONFIG_RCU_TRACE | ||
152 | unsigned long n_grace_periods; | ||
153 | #ifdef CONFIG_RCU_BOOST | ||
154 | unsigned long n_tasks_boosted; | ||
155 | /* Total number of tasks boosted. */ | ||
156 | unsigned long n_exp_boosts; | ||
157 | /* Number of tasks boosted for expedited GP. */ | ||
158 | unsigned long n_normal_boosts; | ||
159 | /* Number of tasks boosted for normal GP. */ | ||
160 | unsigned long n_balk_blkd_tasks; | ||
161 | /* Refused to boost: no blocked tasks. */ | ||
162 | unsigned long n_balk_exp_gp_tasks; | ||
163 | /* Refused to boost: nothing blocking GP. */ | ||
164 | unsigned long n_balk_boost_tasks; | ||
165 | /* Refused to boost: already boosting. */ | ||
166 | unsigned long n_balk_notyet; | ||
167 | /* Refused to boost: not yet time. */ | ||
168 | unsigned long n_balk_nos; | ||
169 | /* Refused to boost: not sure why, though. */ | ||
170 | /* This can happen due to race conditions. */ | ||
171 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
172 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
173 | }; | ||
174 | |||
175 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | ||
176 | .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
177 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
178 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
179 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | ||
180 | RCU_TRACE(.rcb.name = "rcu_preempt") | ||
181 | }; | ||
182 | |||
183 | static int rcu_preempted_readers_exp(void); | ||
184 | static void rcu_report_exp_done(void); | ||
185 | |||
186 | /* | ||
187 | * Return true if the CPU has not yet responded to the current grace period. | ||
188 | */ | ||
189 | static int rcu_cpu_blocking_cur_gp(void) | ||
190 | { | ||
191 | return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * Check for a running RCU reader. Because there is only one CPU, | ||
196 | * there can be but one running RCU reader at a time. ;-) | ||
197 | * | ||
198 | * Returns zero if there are no running readers. Returns a positive | ||
199 | * number if there is at least one reader within its RCU read-side | ||
200 | * critical section. Returns a negative number if an outermost reader | ||
201 | * is in the midst of exiting from its RCU read-side critical section | ||
202 | * | ||
203 | * Returns zero if there are no running readers. Returns a positive | ||
204 | * number if there is at least one reader within its RCU read-side | ||
205 | * critical section. Returns a negative number if an outermost reader | ||
206 | * is in the midst of exiting from its RCU read-side critical section. | ||
207 | */ | ||
208 | static int rcu_preempt_running_reader(void) | ||
209 | { | ||
210 | return current->rcu_read_lock_nesting; | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Check for preempted RCU readers blocking any grace period. | ||
215 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
216 | */ | ||
217 | static int rcu_preempt_blocked_readers_any(void) | ||
218 | { | ||
219 | return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * Check for preempted RCU readers blocking the current grace period. | ||
224 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
225 | */ | ||
226 | static int rcu_preempt_blocked_readers_cgp(void) | ||
227 | { | ||
228 | return rcu_preempt_ctrlblk.gp_tasks != NULL; | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * Return true if another preemptible-RCU grace period is needed. | ||
233 | */ | ||
234 | static int rcu_preempt_needs_another_gp(void) | ||
235 | { | ||
236 | return *rcu_preempt_ctrlblk.rcb.curtail != NULL; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Return true if a preemptible-RCU grace period is in progress. | ||
241 | * The caller must disable hardirqs. | ||
242 | */ | ||
243 | static int rcu_preempt_gp_in_progress(void) | ||
244 | { | ||
245 | return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; | ||
246 | } | ||
247 | |||
248 | /* | ||
249 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
250 | * returning NULL if at the end of the list. | ||
251 | */ | ||
252 | static struct list_head *rcu_next_node_entry(struct task_struct *t) | ||
253 | { | ||
254 | struct list_head *np; | ||
255 | |||
256 | np = t->rcu_node_entry.next; | ||
257 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
258 | np = NULL; | ||
259 | return np; | ||
260 | } | ||
261 | |||
262 | #ifdef CONFIG_RCU_TRACE | ||
263 | |||
264 | #ifdef CONFIG_RCU_BOOST | ||
265 | static void rcu_initiate_boost_trace(void); | ||
266 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
267 | |||
268 | /* | ||
269 | * Dump additional statistice for TINY_PREEMPT_RCU. | ||
270 | */ | ||
271 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
272 | { | ||
273 | seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | ||
274 | rcu_preempt_ctrlblk.rcb.qlen, | ||
275 | rcu_preempt_ctrlblk.n_grace_periods, | ||
276 | rcu_preempt_ctrlblk.gpnum, | ||
277 | rcu_preempt_ctrlblk.gpcpu, | ||
278 | rcu_preempt_ctrlblk.completed, | ||
279 | "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | ||
280 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | ||
281 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | ||
282 | #ifdef CONFIG_RCU_BOOST | ||
283 | seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
284 | " ", | ||
285 | "B."[!rcu_preempt_ctrlblk.boost_tasks], | ||
286 | rcu_preempt_ctrlblk.n_tasks_boosted, | ||
287 | rcu_preempt_ctrlblk.n_exp_boosts, | ||
288 | rcu_preempt_ctrlblk.n_normal_boosts, | ||
289 | (int)(jiffies & 0xffff), | ||
290 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | ||
291 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", | ||
292 | " balk", | ||
293 | rcu_preempt_ctrlblk.n_balk_blkd_tasks, | ||
294 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, | ||
295 | rcu_preempt_ctrlblk.n_balk_boost_tasks, | ||
296 | rcu_preempt_ctrlblk.n_balk_notyet, | ||
297 | rcu_preempt_ctrlblk.n_balk_nos); | ||
298 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
299 | } | ||
300 | |||
301 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
302 | |||
303 | #ifdef CONFIG_RCU_BOOST | ||
304 | |||
305 | #include "rtmutex_common.h" | ||
306 | |||
307 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
308 | |||
309 | /* Controls for rcu_kthread() kthread. */ | ||
310 | static struct task_struct *rcu_kthread_task; | ||
311 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | ||
312 | static unsigned long have_rcu_kthread_work; | ||
313 | |||
314 | /* | ||
315 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | ||
316 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | ||
317 | */ | ||
318 | static int rcu_boost(void) | ||
319 | { | ||
320 | unsigned long flags; | ||
321 | struct rt_mutex mtx; | ||
322 | struct task_struct *t; | ||
323 | struct list_head *tb; | ||
324 | |||
325 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
326 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
327 | return 0; /* Nothing to boost. */ | ||
328 | |||
329 | local_irq_save(flags); | ||
330 | |||
331 | /* | ||
332 | * Recheck with irqs disabled: all tasks in need of boosting | ||
333 | * might exit their RCU read-side critical sections on their own | ||
334 | * if we are preempted just before disabling irqs. | ||
335 | */ | ||
336 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
337 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
338 | local_irq_restore(flags); | ||
339 | return 0; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * Preferentially boost tasks blocking expedited grace periods. | ||
344 | * This cannot starve the normal grace periods because a second | ||
345 | * expedited grace period must boost all blocked tasks, including | ||
346 | * those blocking the pre-existing normal grace period. | ||
347 | */ | ||
348 | if (rcu_preempt_ctrlblk.exp_tasks != NULL) { | ||
349 | tb = rcu_preempt_ctrlblk.exp_tasks; | ||
350 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
351 | } else { | ||
352 | tb = rcu_preempt_ctrlblk.boost_tasks; | ||
353 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
354 | } | ||
355 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
356 | |||
357 | /* | ||
358 | * We boost task t by manufacturing an rt_mutex that appears to | ||
359 | * be held by task t. We leave a pointer to that rt_mutex where | ||
360 | * task t can find it, and task t will release the mutex when it | ||
361 | * exits its outermost RCU read-side critical section. Then | ||
362 | * simply acquiring this artificial rt_mutex will boost task | ||
363 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
364 | */ | ||
365 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
366 | rt_mutex_init_proxy_locked(&mtx, t); | ||
367 | t->rcu_boost_mutex = &mtx; | ||
368 | local_irq_restore(flags); | ||
369 | rt_mutex_lock(&mtx); | ||
370 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
371 | |||
372 | return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || | ||
373 | ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * Check to see if it is now time to start boosting RCU readers blocking | ||
378 | * the current grace period, and, if so, tell the rcu_kthread_task to | ||
379 | * start boosting them. If there is an expedited boost in progress, | ||
380 | * we wait for it to complete. | ||
381 | * | ||
382 | * If there are no blocked readers blocking the current grace period, | ||
383 | * return 0 to let the caller know, otherwise return 1. Note that this | ||
384 | * return value is independent of whether or not boosting was done. | ||
385 | */ | ||
386 | static int rcu_initiate_boost(void) | ||
387 | { | ||
388 | if (!rcu_preempt_blocked_readers_cgp() && | ||
389 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
390 | RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); | ||
391 | return 0; | ||
392 | } | ||
393 | if (rcu_preempt_ctrlblk.exp_tasks != NULL || | ||
394 | (rcu_preempt_ctrlblk.gp_tasks != NULL && | ||
395 | rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
396 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { | ||
397 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
398 | rcu_preempt_ctrlblk.boost_tasks = | ||
399 | rcu_preempt_ctrlblk.gp_tasks; | ||
400 | invoke_rcu_callbacks(); | ||
401 | } else { | ||
402 | RCU_TRACE(rcu_initiate_boost_trace()); | ||
403 | } | ||
404 | return 1; | ||
405 | } | ||
406 | |||
407 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
408 | |||
409 | /* | ||
410 | * Do priority-boost accounting for the start of a new grace period. | ||
411 | */ | ||
412 | static void rcu_preempt_boost_start_gp(void) | ||
413 | { | ||
414 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
415 | } | ||
416 | |||
417 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
418 | |||
419 | /* | ||
420 | * If there is no RCU priority boosting, we don't initiate boosting, | ||
421 | * but we do indicate whether there are blocked readers blocking the | ||
422 | * current grace period. | ||
423 | */ | ||
424 | static int rcu_initiate_boost(void) | ||
425 | { | ||
426 | return rcu_preempt_blocked_readers_cgp(); | ||
427 | } | ||
428 | |||
429 | /* | ||
430 | * If there is no RCU priority boosting, nothing to do at grace-period start. | ||
431 | */ | ||
432 | static void rcu_preempt_boost_start_gp(void) | ||
433 | { | ||
434 | } | ||
435 | |||
436 | #endif /* else #ifdef CONFIG_RCU_BOOST */ | ||
437 | |||
438 | /* | ||
439 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
440 | * that this just means that the task currently running on the CPU is | ||
441 | * in a quiescent state. There might be any number of tasks blocked | ||
442 | * while in an RCU read-side critical section. | ||
443 | * | ||
444 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
445 | * must disable irqs in order to protect the assignment to | ||
446 | * ->rcu_read_unlock_special. | ||
447 | * | ||
448 | * Because this is a single-CPU implementation, the only way a grace | ||
449 | * period can end is if the CPU is in a quiescent state. The reason is | ||
450 | * that a blocked preemptible-RCU reader can exit its critical section | ||
451 | * only if the CPU is running it at the time. Therefore, when the | ||
452 | * last task blocking the current grace period exits its RCU read-side | ||
453 | * critical section, neither the CPU nor blocked tasks will be stopping | ||
454 | * the current grace period. (In contrast, SMP implementations | ||
455 | * might have CPUs running in RCU read-side critical sections that | ||
456 | * block later grace periods -- but this is not possible given only | ||
457 | * one CPU.) | ||
458 | */ | ||
459 | static void rcu_preempt_cpu_qs(void) | ||
460 | { | ||
461 | /* Record both CPU and task as having responded to current GP. */ | ||
462 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | ||
463 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
464 | |||
465 | /* If there is no GP then there is nothing more to do. */ | ||
466 | if (!rcu_preempt_gp_in_progress()) | ||
467 | return; | ||
468 | /* | ||
469 | * Check up on boosting. If there are readers blocking the | ||
470 | * current grace period, leave. | ||
471 | */ | ||
472 | if (rcu_initiate_boost()) | ||
473 | return; | ||
474 | |||
475 | /* Advance callbacks. */ | ||
476 | rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; | ||
477 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; | ||
478 | rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; | ||
479 | |||
480 | /* If there are no blocked readers, next GP is done instantly. */ | ||
481 | if (!rcu_preempt_blocked_readers_any()) | ||
482 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | ||
483 | |||
484 | /* If there are done callbacks, cause them to be invoked. */ | ||
485 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | ||
486 | invoke_rcu_callbacks(); | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Start a new RCU grace period if warranted. Hard irqs must be disabled. | ||
491 | */ | ||
492 | static void rcu_preempt_start_gp(void) | ||
493 | { | ||
494 | if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { | ||
495 | |||
496 | /* Official start of GP. */ | ||
497 | rcu_preempt_ctrlblk.gpnum++; | ||
498 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | ||
499 | reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); | ||
500 | |||
501 | /* Any blocked RCU readers block new GP. */ | ||
502 | if (rcu_preempt_blocked_readers_any()) | ||
503 | rcu_preempt_ctrlblk.gp_tasks = | ||
504 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
505 | |||
506 | /* Set up for RCU priority boosting. */ | ||
507 | rcu_preempt_boost_start_gp(); | ||
508 | |||
509 | /* If there is no running reader, CPU is done with GP. */ | ||
510 | if (!rcu_preempt_running_reader()) | ||
511 | rcu_preempt_cpu_qs(); | ||
512 | } | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * We have entered the scheduler, and the current task might soon be | ||
517 | * context-switched away from. If this task is in an RCU read-side | ||
518 | * critical section, we will no longer be able to rely on the CPU to | ||
519 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
520 | * If the task started after the current grace period began, as recorded | ||
521 | * by ->gpcpu, we enqueue at the beginning of the list. Otherwise | ||
522 | * before the element referenced by ->gp_tasks (or at the tail if | ||
523 | * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. | ||
524 | * The task will dequeue itself when it exits the outermost enclosing | ||
525 | * RCU read-side critical section. Therefore, the current grace period | ||
526 | * cannot be permitted to complete until the ->gp_tasks pointer becomes | ||
527 | * NULL. | ||
528 | * | ||
529 | * Caller must disable preemption. | ||
530 | */ | ||
531 | void rcu_preempt_note_context_switch(void) | ||
532 | { | ||
533 | struct task_struct *t = current; | ||
534 | unsigned long flags; | ||
535 | |||
536 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | ||
537 | if (rcu_preempt_running_reader() > 0 && | ||
538 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
539 | |||
540 | /* Possibly blocking in an RCU read-side critical section. */ | ||
541 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
542 | |||
543 | /* | ||
544 | * If this CPU has already checked in, then this task | ||
545 | * will hold up the next grace period rather than the | ||
546 | * current grace period. Queue the task accordingly. | ||
547 | * If the task is queued for the current grace period | ||
548 | * (i.e., this CPU has not yet passed through a quiescent | ||
549 | * state for the current grace period), then as long | ||
550 | * as that task remains queued, the current grace period | ||
551 | * cannot end. | ||
552 | */ | ||
553 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | ||
554 | if (rcu_cpu_blocking_cur_gp()) | ||
555 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | ||
556 | } else if (rcu_preempt_running_reader() < 0 && | ||
557 | t->rcu_read_unlock_special) { | ||
558 | /* | ||
559 | * Complete exit from RCU read-side critical section on | ||
560 | * behalf of preempted instance of __rcu_read_unlock(). | ||
561 | */ | ||
562 | rcu_read_unlock_special(t); | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * Either we were not in an RCU read-side critical section to | ||
567 | * begin with, or we have now recorded that critical section | ||
568 | * globally. Either way, we can now note a quiescent state | ||
569 | * for this CPU. Again, if we were in an RCU read-side critical | ||
570 | * section, and if that critical section was blocking the current | ||
571 | * grace period, then the fact that the task has been enqueued | ||
572 | * means that current grace period continues to be blocked. | ||
573 | */ | ||
574 | rcu_preempt_cpu_qs(); | ||
575 | local_irq_restore(flags); | ||
576 | } | ||
577 | |||
578 | /* | ||
579 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
580 | * notify RCU core processing or task having blocked during the RCU | ||
581 | * read-side critical section. | ||
582 | */ | ||
583 | void rcu_read_unlock_special(struct task_struct *t) | ||
584 | { | ||
585 | int empty; | ||
586 | int empty_exp; | ||
587 | unsigned long flags; | ||
588 | struct list_head *np; | ||
589 | #ifdef CONFIG_RCU_BOOST | ||
590 | struct rt_mutex *rbmp = NULL; | ||
591 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
592 | int special; | ||
593 | |||
594 | /* | ||
595 | * NMI handlers cannot block and cannot safely manipulate state. | ||
596 | * They therefore cannot possibly be special, so just leave. | ||
597 | */ | ||
598 | if (in_nmi()) | ||
599 | return; | ||
600 | |||
601 | local_irq_save(flags); | ||
602 | |||
603 | /* | ||
604 | * If RCU core is waiting for this CPU to exit critical section, | ||
605 | * let it know that we have done so. | ||
606 | */ | ||
607 | special = t->rcu_read_unlock_special; | ||
608 | if (special & RCU_READ_UNLOCK_NEED_QS) | ||
609 | rcu_preempt_cpu_qs(); | ||
610 | |||
611 | /* Hardware IRQ handlers cannot block. */ | ||
612 | if (in_irq() || in_serving_softirq()) { | ||
613 | local_irq_restore(flags); | ||
614 | return; | ||
615 | } | ||
616 | |||
617 | /* Clean up if blocked during RCU read-side critical section. */ | ||
618 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
619 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
620 | |||
621 | /* | ||
622 | * Remove this task from the ->blkd_tasks list and adjust | ||
623 | * any pointers that might have been referencing it. | ||
624 | */ | ||
625 | empty = !rcu_preempt_blocked_readers_cgp(); | ||
626 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | ||
627 | np = rcu_next_node_entry(t); | ||
628 | list_del_init(&t->rcu_node_entry); | ||
629 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | ||
630 | rcu_preempt_ctrlblk.gp_tasks = np; | ||
631 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | ||
632 | rcu_preempt_ctrlblk.exp_tasks = np; | ||
633 | #ifdef CONFIG_RCU_BOOST | ||
634 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | ||
635 | rcu_preempt_ctrlblk.boost_tasks = np; | ||
636 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
637 | |||
638 | /* | ||
639 | * If this was the last task on the current list, and if | ||
640 | * we aren't waiting on the CPU, report the quiescent state | ||
641 | * and start a new grace period if needed. | ||
642 | */ | ||
643 | if (!empty && !rcu_preempt_blocked_readers_cgp()) { | ||
644 | rcu_preempt_cpu_qs(); | ||
645 | rcu_preempt_start_gp(); | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * If this was the last task on the expedited lists, | ||
650 | * then we need wake up the waiting task. | ||
651 | */ | ||
652 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
653 | rcu_report_exp_done(); | ||
654 | } | ||
655 | #ifdef CONFIG_RCU_BOOST | ||
656 | /* Unboost self if was boosted. */ | ||
657 | if (t->rcu_boost_mutex != NULL) { | ||
658 | rbmp = t->rcu_boost_mutex; | ||
659 | t->rcu_boost_mutex = NULL; | ||
660 | rt_mutex_unlock(rbmp); | ||
661 | } | ||
662 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
663 | local_irq_restore(flags); | ||
664 | } | ||
665 | |||
666 | /* | ||
667 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
668 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | ||
669 | * checked elsewhere. This is called from the scheduling-clock interrupt. | ||
670 | * | ||
671 | * Caller must disable hard irqs. | ||
672 | */ | ||
673 | static void rcu_preempt_check_callbacks(void) | ||
674 | { | ||
675 | struct task_struct *t = current; | ||
676 | |||
677 | if (rcu_preempt_gp_in_progress() && | ||
678 | (!rcu_preempt_running_reader() || | ||
679 | !rcu_cpu_blocking_cur_gp())) | ||
680 | rcu_preempt_cpu_qs(); | ||
681 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | ||
682 | rcu_preempt_ctrlblk.rcb.donetail) | ||
683 | invoke_rcu_callbacks(); | ||
684 | if (rcu_preempt_gp_in_progress() && | ||
685 | rcu_cpu_blocking_cur_gp() && | ||
686 | rcu_preempt_running_reader() > 0) | ||
687 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
688 | } | ||
689 | |||
690 | /* | ||
691 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | ||
692 | * update, so this is invoked from rcu_process_callbacks() to | ||
693 | * handle that case. Of course, it is invoked for all flavors of | ||
694 | * RCU, but RCU callbacks can appear only on one of the lists, and | ||
695 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | ||
696 | * is no need for an explicit check. | ||
697 | */ | ||
698 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
699 | { | ||
700 | if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) | ||
701 | rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Process callbacks for preemptible RCU. | ||
706 | */ | ||
707 | static void rcu_preempt_process_callbacks(void) | ||
708 | { | ||
709 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | ||
710 | } | ||
711 | |||
712 | /* | ||
713 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
714 | */ | ||
715 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
716 | { | ||
717 | unsigned long flags; | ||
718 | |||
719 | debug_rcu_head_queue(head); | ||
720 | head->func = func; | ||
721 | head->next = NULL; | ||
722 | |||
723 | local_irq_save(flags); | ||
724 | *rcu_preempt_ctrlblk.nexttail = head; | ||
725 | rcu_preempt_ctrlblk.nexttail = &head->next; | ||
726 | RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | ||
727 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | ||
728 | local_irq_restore(flags); | ||
729 | } | ||
730 | EXPORT_SYMBOL_GPL(call_rcu); | ||
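For reference, the usual caller-side pattern for call_rcu() embeds the rcu_head in the structure being protected and recovers the enclosing object with container_of() in the callback. A minimal sketch, using made-up names (struct foo, foo_reclaim(), foo_del()) that are not part of this patch:

    #include <linux/rcupdate.h>
    #include <linux/rculist.h>
    #include <linux/slab.h>

    struct foo {
            struct list_head list;          /* linkage on an RCU-protected list */
            int data;
            struct rcu_head rcu;            /* storage used by call_rcu() */
    };

    static void foo_reclaim(struct rcu_head *rcu)
    {
            struct foo *fp = container_of(rcu, struct foo, rcu);

            kfree(fp);                      /* runs only after a grace period has elapsed */
    }

    static void foo_del(struct foo *fp)
    {
            /* Caller serializes updaters, e.g. with a spinlock. */
            list_del_rcu(&fp->list);        /* unpublish from readers */
            call_rcu(&fp->rcu, foo_reclaim); /* defer the free past the grace period */
    }

The queued callback is later invoked from rcu_process_callbacks() context once the grace-period machinery above declares the grace period complete.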
731 | |||
732 | /* | ||
733 | * synchronize_rcu - wait until a grace period has elapsed. | ||
734 | * | ||
735 | * Control will return to the caller some time after a full grace | ||
736 | * period has elapsed, in other words after all currently executing RCU | ||
737 | * read-side critical sections have completed. RCU read-side critical | ||
738 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
739 | * and may be nested. | ||
740 | */ | ||
741 | void synchronize_rcu(void) | ||
742 | { | ||
743 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
744 | !lock_is_held(&rcu_lock_map) && | ||
745 | !lock_is_held(&rcu_sched_lock_map), | ||
746 | "Illegal synchronize_rcu() in RCU read-side critical section"); | ||
747 | |||
748 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
749 | if (!rcu_scheduler_active) | ||
750 | return; | ||
751 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
752 | |||
753 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
754 | if (!rcu_preempt_blocked_readers_any()) | ||
755 | return; | ||
756 | |||
757 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | ||
758 | if (rcu_expedited) | ||
759 | synchronize_rcu_expedited(); | ||
760 | else | ||
761 | rcu_barrier(); | ||
762 | } | ||
763 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
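A typical update-side pairing for synchronize_rcu(), reusing the hypothetical struct foo from the call_rcu() sketch above (global_foo and foo_lock are likewise illustrative names): replace the published pointer, wait out pre-existing readers, then free the old version.

    static struct foo __rcu *global_foo;
    static DEFINE_SPINLOCK(foo_lock);

    static void foo_replace(struct foo *new)
    {
            struct foo *old;

            spin_lock(&foo_lock);
            old = rcu_dereference_protected(global_foo,
                                            lockdep_is_held(&foo_lock));
            rcu_assign_pointer(global_foo, new);
            spin_unlock(&foo_lock);

            synchronize_rcu();              /* all pre-existing readers are now done */
            kfree(old);
    }

In this tiny implementation, once the fastpath checks fail, the rcu_expedited knob selects between synchronize_rcu_expedited() and rcu_barrier(), as shown above.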
764 | |||
765 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
766 | static unsigned long sync_rcu_preempt_exp_count; | ||
767 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
768 | |||
769 | /* | ||
770 | * Return non-zero if there are any tasks in RCU read-side critical | ||
771 | * sections blocking the current preemptible-RCU expedited grace period. | ||
772 | * If there is no preemptible-RCU expedited grace period currently in | ||
773 | * progress, returns zero unconditionally. | ||
774 | */ | ||
775 | static int rcu_preempted_readers_exp(void) | ||
776 | { | ||
777 | return rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | * Report the exit from RCU read-side critical section for the last task | ||
782 | * that queued itself during or before the current expedited preemptible-RCU | ||
783 | * grace period. | ||
784 | */ | ||
785 | static void rcu_report_exp_done(void) | ||
786 | { | ||
787 | wake_up(&sync_rcu_preempt_exp_wq); | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | ||
792 | * is to rely on the fact that there is but one CPU, and that it is | ||
793 | * illegal for a task to invoke synchronize_rcu_expedited() while in a | ||
794 | * preemptible-RCU read-side critical section. Therefore, any such | ||
795 | * critical sections must correspond to blocked tasks, which must therefore | ||
796 | * be on the ->blkd_tasks list. So just record the current head of the | ||
797 | * list in the ->exp_tasks pointer, and wait for all tasks including and | ||
798 | * after the task pointed to by ->exp_tasks to drain. | ||
799 | */ | ||
800 | void synchronize_rcu_expedited(void) | ||
801 | { | ||
802 | unsigned long flags; | ||
803 | struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; | ||
804 | unsigned long snap; | ||
805 | |||
806 | barrier(); /* ensure prior action seen before grace period. */ | ||
807 | |||
808 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
809 | |||
810 | /* | ||
811 | * Acquire lock so that there is only one preemptible RCU grace | ||
812 | * period in flight. Of course, if someone does the expedited | ||
813 | * grace period for us while we are acquiring the lock, just leave. | ||
814 | */ | ||
815 | snap = sync_rcu_preempt_exp_count + 1; | ||
816 | mutex_lock(&sync_rcu_preempt_exp_mutex); | ||
817 | if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) | ||
818 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
819 | |||
820 | local_irq_save(flags); | ||
821 | |||
822 | /* | ||
823 | * All RCU readers must already be on the ->blkd_tasks list because | ||
824 | * we cannot legally be executing in an RCU read-side critical | ||
825 | * section. | ||
826 | */ | ||
827 | |||
828 | /* Snapshot current head of ->blkd_tasks list. */ | ||
829 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | ||
830 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | ||
831 | rpcp->exp_tasks = NULL; | ||
832 | |||
833 | /* Wait for tail of ->blkd_tasks list to drain. */ | ||
834 | if (!rcu_preempted_readers_exp()) { | ||
835 | local_irq_restore(flags); | ||
836 | } else { | ||
837 | rcu_initiate_boost(); | ||
838 | local_irq_restore(flags); | ||
839 | wait_event(sync_rcu_preempt_exp_wq, | ||
840 | !rcu_preempted_readers_exp()); | ||
841 | } | ||
842 | |||
843 | /* Clean up and exit. */ | ||
844 | barrier(); /* ensure expedited GP seen before counter increment. */ | ||
845 | sync_rcu_preempt_exp_count++; | ||
846 | unlock_mb_ret: | ||
847 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
848 | barrier(); /* ensure subsequent action seen after grace period. */ | ||
849 | } | ||
850 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
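The snapshot/counter sequence above is the standard way to notice that someone else already did the expedited grace period while we waited for the mutex: sync_rcu_preempt_exp_count is bumped at the end of every expedited grace period, and the ULONG_CMP_LT() test on the pre-mutex snapshot detects that a complete grace period began and ended in the interim. The comparison is wraparound-safe; in this era the helpers live in include/linux/rcupdate.h and look roughly like:

    #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

Because the test is done on the unsigned difference, it stays meaningful across counter wrap as long as the two values are within ULONG_MAX/2 of each other.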
851 | |||
852 | /* | ||
853 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | ||
854 | */ | ||
855 | int rcu_preempt_needs_cpu(void) | ||
856 | { | ||
857 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | ||
858 | } | ||
859 | |||
860 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
861 | |||
862 | #ifdef CONFIG_RCU_TRACE | ||
863 | |||
864 | /* | ||
865 | * Because preemptible RCU does not exist, it is not necessary to | ||
866 | * dump out its statistics. | ||
867 | */ | ||
868 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
869 | { | ||
870 | } | ||
871 | |||
872 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
873 | |||
874 | /* | ||
875 | * Because preemptible RCU does not exist, it never has any callbacks | ||
876 | * to check. | ||
877 | */ | ||
878 | static void rcu_preempt_check_callbacks(void) | ||
879 | { | ||
880 | } | ||
881 | |||
882 | /* | ||
883 | * Because preemptible RCU does not exist, it never has any callbacks | ||
884 | * to remove. | ||
885 | */ | ||
886 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
887 | { | ||
888 | } | ||
889 | |||
890 | /* | ||
891 | * Because preemptible RCU does not exist, it never has any callbacks | ||
892 | * to process. | ||
893 | */ | ||
894 | static void rcu_preempt_process_callbacks(void) | ||
895 | { | ||
896 | } | ||
897 | |||
898 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
899 | |||
900 | #ifdef CONFIG_RCU_BOOST | ||
901 | |||
902 | /* | ||
903 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
904 | * or to boost readers. | ||
905 | */ | ||
906 | static void invoke_rcu_callbacks(void) | ||
907 | { | ||
908 | have_rcu_kthread_work = 1; | ||
909 | if (rcu_kthread_task != NULL) | ||
910 | wake_up(&rcu_kthread_wq); | ||
911 | } | ||
912 | |||
913 | #ifdef CONFIG_RCU_TRACE | ||
914 | |||
915 | /* | ||
916 | * Is the current CPU running the RCU-callbacks kthread? | ||
917 | * Caller must have preemption disabled. | ||
918 | */ | ||
919 | static bool rcu_is_callbacks_kthread(void) | ||
920 | { | ||
921 | return rcu_kthread_task == current; | ||
922 | } | ||
923 | |||
924 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
925 | |||
926 | /* | ||
927 | * This kthread invokes RCU callbacks whose grace periods have | ||
928 | * elapsed. It is awakened as needed, and takes the place of the | ||
929 | * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. | ||
930 | * This is a kthread, but it is never stopped, at least not until | ||
931 | * the system goes down. | ||
932 | */ | ||
933 | static int rcu_kthread(void *arg) | ||
934 | { | ||
935 | unsigned long work; | ||
936 | unsigned long morework; | ||
937 | unsigned long flags; | ||
938 | |||
939 | for (;;) { | ||
940 | wait_event_interruptible(rcu_kthread_wq, | ||
941 | have_rcu_kthread_work != 0); | ||
942 | morework = rcu_boost(); | ||
943 | local_irq_save(flags); | ||
944 | work = have_rcu_kthread_work; | ||
945 | have_rcu_kthread_work = morework; | ||
946 | local_irq_restore(flags); | ||
947 | if (work) | ||
948 | rcu_process_callbacks(NULL); | ||
949 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
950 | } | ||
951 | |||
952 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
953 | } | ||
954 | |||
955 | /* | ||
956 | * Spawn the kthread that invokes RCU callbacks. | ||
957 | */ | ||
958 | static int __init rcu_spawn_kthreads(void) | ||
959 | { | ||
960 | struct sched_param sp; | ||
961 | |||
962 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
963 | sp.sched_priority = RCU_BOOST_PRIO; | ||
964 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
965 | return 0; | ||
966 | } | ||
967 | early_initcall(rcu_spawn_kthreads); | ||
968 | |||
969 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
970 | |||
971 | /* Hold off callback invocation until early_initcall() time. */ | ||
972 | static int rcu_scheduler_fully_active __read_mostly; | ||
973 | |||
974 | /* | ||
975 | * Start up softirq processing of callbacks. | ||
976 | */ | ||
977 | void invoke_rcu_callbacks(void) | ||
978 | { | ||
979 | if (rcu_scheduler_fully_active) | ||
980 | raise_softirq(RCU_SOFTIRQ); | ||
981 | } | ||
982 | |||
983 | #ifdef CONFIG_RCU_TRACE | ||
984 | |||
985 | /* | ||
986 | * There is no callback kthread, so this thread is never it. | ||
987 | */ | ||
988 | static bool rcu_is_callbacks_kthread(void) | ||
989 | { | ||
990 | return false; | ||
991 | } | ||
992 | |||
993 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
994 | |||
995 | static int __init rcu_scheduler_really_started(void) | ||
996 | { | ||
997 | rcu_scheduler_fully_active = 1; | ||
998 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
999 | raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ | ||
1000 | return 0; | ||
1001 | } | ||
1002 | early_initcall(rcu_scheduler_really_started); | ||
1003 | |||
1004 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
1005 | |||
1006 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
1007 | #include <linux/kernel_stat.h> | ||
1008 | 60 | ||
1009 | /* | 61 | /* |
1010 | * During boot, we forgive RCU lockdep issues. After this function is | 62 | * During boot, we forgive RCU lockdep issues. After this function is |
@@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void) | |||
1020 | 72 | ||
1021 | #ifdef CONFIG_RCU_TRACE | 73 | #ifdef CONFIG_RCU_TRACE |
1022 | 74 | ||
1023 | #ifdef CONFIG_RCU_BOOST | ||
1024 | |||
1025 | static void rcu_initiate_boost_trace(void) | ||
1026 | { | ||
1027 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
1028 | rcu_preempt_ctrlblk.n_balk_blkd_tasks++; | ||
1029 | else if (rcu_preempt_ctrlblk.gp_tasks == NULL && | ||
1030 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
1031 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; | ||
1032 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | ||
1033 | rcu_preempt_ctrlblk.n_balk_boost_tasks++; | ||
1034 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | ||
1035 | rcu_preempt_ctrlblk.n_balk_notyet++; | ||
1036 | else | ||
1037 | rcu_preempt_ctrlblk.n_balk_nos++; | ||
1038 | } | ||
1039 | |||
1040 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
1041 | |||
1042 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | 75 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) |
1043 | { | 76 | { |
1044 | unsigned long flags; | 77 | unsigned long flags; |
@@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | |||
1053 | */ | 86 | */ |
1054 | static int show_tiny_stats(struct seq_file *m, void *unused) | 87 | static int show_tiny_stats(struct seq_file *m, void *unused) |
1055 | { | 88 | { |
1056 | show_tiny_preempt_stats(m); | ||
1057 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | 89 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); |
1058 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | 90 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); |
1059 | return 0; | 91 | return 0; |
@@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney"); | |||
1103 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | 135 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); |
1104 | MODULE_LICENSE("GPL"); | 136 | MODULE_LICENSE("GPL"); |
1105 | 137 | ||
1106 | static void check_cpu_stall_preempt(void) | 138 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) |
1107 | { | 139 | { |
1108 | #ifdef CONFIG_TINY_PREEMPT_RCU | 140 | unsigned long j; |
1109 | check_cpu_stall(&rcu_preempt_ctrlblk.rcb); | 141 | unsigned long js; |
1110 | #endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 142 | |
143 | if (rcu_cpu_stall_suppress) | ||
144 | return; | ||
145 | rcp->ticks_this_gp++; | ||
146 | j = jiffies; | ||
147 | js = rcp->jiffies_stall; | ||
148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | ||
149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | ||
150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | ||
151 | jiffies - rcp->gp_start, rcp->qlen); | ||
152 | dump_stack(); | ||
153 | } | ||
154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
155 | rcp->jiffies_stall = jiffies + | ||
156 | 3 * rcu_jiffies_till_stall_check() + 3; | ||
157 | else if (ULONG_CMP_GE(j, js)) | ||
158 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
159 | } | ||
160 | |||
161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | ||
162 | { | ||
163 | rcp->ticks_this_gp = 0; | ||
164 | rcp->gp_start = jiffies; | ||
165 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
166 | } | ||
167 | |||
168 | static void check_cpu_stalls(void) | ||
169 | { | ||
170 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | ||
171 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | ||
1111 | } | 172 | } |
1112 | 173 | ||
1113 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 174 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c96724..b1fa5510388d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = { | |||
695 | .name = "srcu_sync" | 695 | .name = "srcu_sync" |
696 | }; | 696 | }; |
697 | 697 | ||
698 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | ||
699 | { | ||
700 | return srcu_read_lock_raw(&srcu_ctl); | ||
701 | } | ||
702 | |||
703 | static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | ||
704 | { | ||
705 | srcu_read_unlock_raw(&srcu_ctl, idx); | ||
706 | } | ||
707 | |||
708 | static struct rcu_torture_ops srcu_raw_ops = { | ||
709 | .init = rcu_sync_torture_init, | ||
710 | .readlock = srcu_torture_read_lock_raw, | ||
711 | .read_delay = srcu_read_delay, | ||
712 | .readunlock = srcu_torture_read_unlock_raw, | ||
713 | .completed = srcu_torture_completed, | ||
714 | .deferred_free = srcu_torture_deferred_free, | ||
715 | .sync = srcu_torture_synchronize, | ||
716 | .call = NULL, | ||
717 | .cb_barrier = NULL, | ||
718 | .stats = srcu_torture_stats, | ||
719 | .name = "srcu_raw" | ||
720 | }; | ||
721 | |||
722 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
723 | .init = rcu_sync_torture_init, | ||
724 | .readlock = srcu_torture_read_lock_raw, | ||
725 | .read_delay = srcu_read_delay, | ||
726 | .readunlock = srcu_torture_read_unlock_raw, | ||
727 | .completed = srcu_torture_completed, | ||
728 | .deferred_free = rcu_sync_torture_deferred_free, | ||
729 | .sync = srcu_torture_synchronize, | ||
730 | .call = NULL, | ||
731 | .cb_barrier = NULL, | ||
732 | .stats = srcu_torture_stats, | ||
733 | .name = "srcu_raw_sync" | ||
734 | }; | ||
735 | |||
736 | static void srcu_torture_synchronize_expedited(void) | 698 | static void srcu_torture_synchronize_expedited(void) |
737 | { | 699 | { |
738 | synchronize_srcu_expedited(&srcu_ctl); | 700 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -1983,7 +1945,6 @@ rcu_torture_init(void) | |||
1983 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1945 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1984 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1946 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1985 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, | 1947 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, |
1986 | &srcu_raw_ops, &srcu_raw_sync_ops, | ||
1987 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1948 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1988 | 1949 | ||
1989 | mutex_lock(&fullstop_mutex); | 1950 | mutex_lock(&fullstop_mutex); |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 16ea67925015..cf3adc6fe001 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -218,8 +218,8 @@ module_param(blimit, long, 0444); | |||
218 | module_param(qhimark, long, 0444); | 218 | module_param(qhimark, long, 0444); |
219 | module_param(qlowmark, long, 0444); | 219 | module_param(qlowmark, long, 0444); |
220 | 220 | ||
221 | static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; | 221 | static ulong jiffies_till_first_fqs = ULONG_MAX; |
222 | static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | 222 | static ulong jiffies_till_next_fqs = ULONG_MAX; |
223 | 223 | ||
224 | module_param(jiffies_till_first_fqs, ulong, 0644); | 224 | module_param(jiffies_till_first_fqs, ulong, 0644); |
225 | module_param(jiffies_till_next_fqs, ulong, 0644); | 225 | module_param(jiffies_till_next_fqs, ulong, 0644); |
@@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
866 | * See Documentation/RCU/stallwarn.txt for info on how to debug | 866 | * See Documentation/RCU/stallwarn.txt for info on how to debug |
867 | * RCU CPU stall warnings. | 867 | * RCU CPU stall warnings. |
868 | */ | 868 | */ |
869 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", | 869 | pr_err("INFO: %s detected stalls on CPUs/tasks:", |
870 | rsp->name); | 870 | rsp->name); |
871 | print_cpu_stall_info_begin(); | 871 | print_cpu_stall_info_begin(); |
872 | rcu_for_each_leaf_node(rsp, rnp) { | 872 | rcu_for_each_leaf_node(rsp, rnp) { |
@@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
899 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 899 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
900 | rsp->gpnum, rsp->completed, totqlen); | 900 | rsp->gpnum, rsp->completed, totqlen); |
901 | if (ndetected == 0) | 901 | if (ndetected == 0) |
902 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | 902 | pr_err("INFO: Stall ended before state dump start\n"); |
903 | else if (!trigger_all_cpu_backtrace()) | 903 | else if (!trigger_all_cpu_backtrace()) |
904 | rcu_dump_cpu_stacks(rsp); | 904 | rcu_dump_cpu_stacks(rsp); |
905 | 905 | ||
@@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
922 | * See Documentation/RCU/stallwarn.txt for info on how to debug | 922 | * See Documentation/RCU/stallwarn.txt for info on how to debug |
923 | * RCU CPU stall warnings. | 923 | * RCU CPU stall warnings. |
924 | */ | 924 | */ |
925 | printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); | 925 | pr_err("INFO: %s self-detected stall on CPU", rsp->name); |
926 | print_cpu_stall_info_begin(); | 926 | print_cpu_stall_info_begin(); |
927 | print_cpu_stall_info(rsp, smp_processor_id()); | 927 | print_cpu_stall_info(rsp, smp_processor_id()); |
928 | print_cpu_stall_info_end(); | 928 | print_cpu_stall_info_end(); |
@@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void) | |||
985 | } | 985 | } |
986 | 986 | ||
987 | /* | 987 | /* |
988 | * Update CPU-local rcu_data state to record the newly noticed grace period. | ||
989 | * This is used both when we started the grace period and when we notice | ||
990 | * that someone else started the grace period. The caller must hold the | ||
991 | * ->lock of the leaf rcu_node structure corresponding to the current CPU, | ||
992 | * and must have irqs disabled. | ||
993 | */ | ||
994 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | ||
995 | { | ||
996 | if (rdp->gpnum != rnp->gpnum) { | ||
997 | /* | ||
998 | * If the current grace period is waiting for this CPU, | ||
999 | * set up to detect a quiescent state, otherwise don't | ||
1000 | * go looking for one. | ||
1001 | */ | ||
1002 | rdp->gpnum = rnp->gpnum; | ||
1003 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | ||
1004 | rdp->passed_quiesce = 0; | ||
1005 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | ||
1006 | zero_cpu_stall_ticks(rdp); | ||
1007 | } | ||
1008 | } | ||
1009 | |||
1010 | static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) | ||
1011 | { | ||
1012 | unsigned long flags; | ||
1013 | struct rcu_node *rnp; | ||
1014 | |||
1015 | local_irq_save(flags); | ||
1016 | rnp = rdp->mynode; | ||
1017 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ | ||
1018 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | ||
1019 | local_irq_restore(flags); | ||
1020 | return; | ||
1021 | } | ||
1022 | __note_new_gpnum(rsp, rnp, rdp); | ||
1023 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1024 | } | ||
1025 | |||
1026 | /* | ||
1027 | * Did someone else start a new RCU grace period start since we last | ||
1028 | * checked? Update local state appropriately if so. Must be called | ||
1029 | * on the CPU corresponding to rdp. | ||
1030 | */ | ||
1031 | static int | ||
1032 | check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) | ||
1033 | { | ||
1034 | unsigned long flags; | ||
1035 | int ret = 0; | ||
1036 | |||
1037 | local_irq_save(flags); | ||
1038 | if (rdp->gpnum != rsp->gpnum) { | ||
1039 | note_new_gpnum(rsp, rdp); | ||
1040 | ret = 1; | ||
1041 | } | ||
1042 | local_irq_restore(flags); | ||
1043 | return ret; | ||
1044 | } | ||
1045 | |||
1046 | /* | ||
1047 | * Initialize the specified rcu_data structure's callback list to empty. | 988 | * Initialize the specified rcu_data structure's callback list to empty. |
1048 | */ | 989 | */ |
1049 | static void init_callback_list(struct rcu_data *rdp) | 990 | static void init_callback_list(struct rcu_data *rdp) |
@@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1313 | } | 1254 | } |
1314 | 1255 | ||
1315 | /* | 1256 | /* |
1316 | * Advance this CPU's callbacks, but only if the current grace period | 1257 | * Update CPU-local rcu_data state to record the beginnings and ends of |
1317 | * has ended. This may be called only from the CPU to whom the rdp | 1258 | * grace periods. The caller must hold the ->lock of the leaf rcu_node |
1318 | * belongs. In addition, the corresponding leaf rcu_node structure's | 1259 | * structure corresponding to the current CPU, and must have irqs disabled. |
1319 | * ->lock must be held by the caller, with irqs disabled. | ||
1320 | */ | 1260 | */ |
1321 | static void | 1261 | static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
1322 | __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | ||
1323 | { | 1262 | { |
1324 | /* Did another grace period end? */ | 1263 | /* Handle the ends of any preceding grace periods first. */ |
1325 | if (rdp->completed == rnp->completed) { | 1264 | if (rdp->completed == rnp->completed) { |
1326 | 1265 | ||
1327 | /* No, so just accelerate recent callbacks. */ | 1266 | /* No grace period end, so just accelerate recent callbacks. */ |
1328 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1267 | rcu_accelerate_cbs(rsp, rnp, rdp); |
1329 | 1268 | ||
1330 | } else { | 1269 | } else { |
@@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
1335 | /* Remember that we saw this grace-period completion. */ | 1274 | /* Remember that we saw this grace-period completion. */ |
1336 | rdp->completed = rnp->completed; | 1275 | rdp->completed = rnp->completed; |
1337 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | 1276 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); |
1277 | } | ||
1338 | 1278 | ||
1279 | if (rdp->gpnum != rnp->gpnum) { | ||
1339 | /* | 1280 | /* |
1340 | * If we were in an extended quiescent state, we may have | 1281 | * If the current grace period is waiting for this CPU, |
1341 | * missed some grace periods that others CPUs handled on | 1282 | * set up to detect a quiescent state, otherwise don't |
1342 | * our behalf. Catch up with this state to avoid noting | 1283 | * go looking for one. |
1343 | * spurious new grace periods. If another grace period | ||
1344 | * has started, then rnp->gpnum will have advanced, so | ||
1345 | * we will detect this later on. Of course, any quiescent | ||
1346 | * states we found for the old GP are now invalid. | ||
1347 | */ | ||
1348 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { | ||
1349 | rdp->gpnum = rdp->completed; | ||
1350 | rdp->passed_quiesce = 0; | ||
1351 | } | ||
1352 | |||
1353 | /* | ||
1354 | * If RCU does not need a quiescent state from this CPU, | ||
1355 | * then make sure that this CPU doesn't go looking for one. | ||
1356 | */ | 1284 | */ |
1357 | if ((rnp->qsmask & rdp->grpmask) == 0) | 1285 | rdp->gpnum = rnp->gpnum; |
1358 | rdp->qs_pending = 0; | 1286 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); |
1287 | rdp->passed_quiesce = 0; | ||
1288 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | ||
1289 | zero_cpu_stall_ticks(rdp); | ||
1359 | } | 1290 | } |
1360 | } | 1291 | } |
1361 | 1292 | ||
1362 | /* | 1293 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) |
1363 | * Advance this CPU's callbacks, but only if the current grace period | ||
1364 | * has ended. This may be called only from the CPU to whom the rdp | ||
1365 | * belongs. | ||
1366 | */ | ||
1367 | static void | ||
1368 | rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) | ||
1369 | { | 1294 | { |
1370 | unsigned long flags; | 1295 | unsigned long flags; |
1371 | struct rcu_node *rnp; | 1296 | struct rcu_node *rnp; |
1372 | 1297 | ||
1373 | local_irq_save(flags); | 1298 | local_irq_save(flags); |
1374 | rnp = rdp->mynode; | 1299 | rnp = rdp->mynode; |
1375 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ | 1300 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && |
1301 | rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ | ||
1376 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | 1302 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
1377 | local_irq_restore(flags); | 1303 | local_irq_restore(flags); |
1378 | return; | 1304 | return; |
1379 | } | 1305 | } |
1380 | __rcu_process_gp_end(rsp, rnp, rdp); | 1306 | __note_gp_changes(rsp, rnp, rdp); |
1381 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1307 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1382 | } | 1308 | } |
1383 | 1309 | ||
1384 | /* | 1310 | /* |
1385 | * Do per-CPU grace-period initialization for running CPU. The caller | ||
1386 | * must hold the lock of the leaf rcu_node structure corresponding to | ||
1387 | * this CPU. | ||
1388 | */ | ||
1389 | static void | ||
1390 | rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | ||
1391 | { | ||
1392 | /* Prior grace period ended, so advance callbacks for current CPU. */ | ||
1393 | __rcu_process_gp_end(rsp, rnp, rdp); | ||
1394 | |||
1395 | /* Set state so that this CPU will detect the next quiescent state. */ | ||
1396 | __note_new_gpnum(rsp, rnp, rdp); | ||
1397 | } | ||
1398 | |||
1399 | /* | ||
1400 | * Initialize a new grace period. | 1311 | * Initialize a new grace period. |
1401 | */ | 1312 | */ |
1402 | static int rcu_gp_init(struct rcu_state *rsp) | 1313 | static int rcu_gp_init(struct rcu_state *rsp) |
@@ -1444,16 +1355,16 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1444 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1355 | WARN_ON_ONCE(rnp->completed != rsp->completed); |
1445 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1356 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
1446 | if (rnp == rdp->mynode) | 1357 | if (rnp == rdp->mynode) |
1447 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1358 | __note_gp_changes(rsp, rnp, rdp); |
1448 | rcu_preempt_boost_start_gp(rnp); | 1359 | rcu_preempt_boost_start_gp(rnp); |
1449 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 1360 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
1450 | rnp->level, rnp->grplo, | 1361 | rnp->level, rnp->grplo, |
1451 | rnp->grphi, rnp->qsmask); | 1362 | rnp->grphi, rnp->qsmask); |
1452 | raw_spin_unlock_irq(&rnp->lock); | 1363 | raw_spin_unlock_irq(&rnp->lock); |
1453 | #ifdef CONFIG_PROVE_RCU_DELAY | 1364 | #ifdef CONFIG_PROVE_RCU_DELAY |
1454 | if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && | 1365 | if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && |
1455 | system_state == SYSTEM_RUNNING) | 1366 | system_state == SYSTEM_RUNNING) |
1456 | schedule_timeout_uninterruptible(2); | 1367 | udelay(200); |
1457 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | 1368 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ |
1458 | cond_resched(); | 1369 | cond_resched(); |
1459 | } | 1370 | } |
@@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1527 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1438 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1528 | rdp = this_cpu_ptr(rsp->rda); | 1439 | rdp = this_cpu_ptr(rsp->rda); |
1529 | if (rnp == rdp->mynode) | 1440 | if (rnp == rdp->mynode) |
1530 | __rcu_process_gp_end(rsp, rnp, rdp); | 1441 | __note_gp_changes(rsp, rnp, rdp); |
1531 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1442 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
1532 | raw_spin_unlock_irq(&rnp->lock); | 1443 | raw_spin_unlock_irq(&rnp->lock); |
1533 | cond_resched(); | 1444 | cond_resched(); |
@@ -1613,6 +1524,14 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1613 | } | 1524 | } |
1614 | } | 1525 | } |
1615 | 1526 | ||
1527 | static void rsp_wakeup(struct irq_work *work) | ||
1528 | { | ||
1529 | struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); | ||
1530 | |||
1531 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
1532 | wake_up(&rsp->gp_wq); | ||
1533 | } | ||
1534 | |||
1616 | /* | 1535 | /* |
1617 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1536 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
1618 | * in preparation for detecting the next grace period. The caller must hold | 1537 | * in preparation for detecting the next grace period. The caller must hold |
@@ -1637,8 +1556,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1637 | } | 1556 | } |
1638 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1557 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1639 | 1558 | ||
1640 | /* Wake up rcu_gp_kthread() to start the grace period. */ | 1559 | /* |
1641 | wake_up(&rsp->gp_wq); | 1560 | * We can't do wakeups while holding the rnp->lock, as that |
1561 | * could deadlock against the rq->lock. Defer | ||
1562 | * the wakeup to interrupt context. | ||
1563 | */ | ||
1564 | irq_work_queue(&rsp->wakeup_work); | ||
1642 | } | 1565 | } |
1643 | 1566 | ||
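The replacement of the direct wake_up() with irq_work_queue() above avoids waking a task while rnp->lock is held, since the wakeup path can take rq->lock and deadlock. The general deferred-wakeup pattern, sketched with assumed names (struct my_state and the my_*() helpers are illustrative; only the irq_work and waitqueue APIs are real):

    #include <linux/irq_work.h>
    #include <linux/wait.h>

    struct my_state {
            wait_queue_head_t wq;
            struct irq_work deferred_wake;  /* runs the wakeup later, from IRQ context */
    };

    static void my_deferred_wake(struct irq_work *work)
    {
            struct my_state *st = container_of(work, struct my_state, deferred_wake);

            wake_up(&st->wq);               /* no scheduler locks held here */
    }

    static void my_init(struct my_state *st)
    {
            init_waitqueue_head(&st->wq);
            init_irq_work(&st->deferred_wake, my_deferred_wake);
    }

    static void my_kick(struct my_state *st)
    {
            /* Callable while holding locks under which a direct wake_up() is unsafe. */
            irq_work_queue(&st->deferred_wake);
    }

rcu_init_one() below wires this up for each rcu_state via init_irq_work(&rsp->wakeup_work, rsp_wakeup).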
1644 | /* | 1567 | /* |
@@ -1793,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
1793 | static void | 1716 | static void |
1794 | rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | 1717 | rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) |
1795 | { | 1718 | { |
1796 | /* If there is now a new grace period, record and return. */ | 1719 | /* Check for grace-period ends and beginnings. */ |
1797 | if (check_for_new_grace_period(rsp, rdp)) | 1720 | note_gp_changes(rsp, rdp); |
1798 | return; | ||
1799 | 1721 | ||
1800 | /* | 1722 | /* |
1801 | * Does this CPU still need to do its part for current grace period? | 1723 | * Does this CPU still need to do its part for current grace period? |
@@ -2259,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
2259 | 2181 | ||
2260 | WARN_ON_ONCE(rdp->beenonline == 0); | 2182 | WARN_ON_ONCE(rdp->beenonline == 0); |
2261 | 2183 | ||
2262 | /* Handle the end of a grace period that some other CPU ended. */ | ||
2263 | rcu_process_gp_end(rsp, rdp); | ||
2264 | |||
2265 | /* Update RCU state based on any recent quiescent states. */ | 2184 | /* Update RCU state based on any recent quiescent states. */ |
2266 | rcu_check_quiescent_state(rsp, rdp); | 2185 | rcu_check_quiescent_state(rsp, rdp); |
2267 | 2186 | ||
@@ -2346,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2346 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 2265 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
2347 | 2266 | ||
2348 | /* Are we ignoring a completed grace period? */ | 2267 | /* Are we ignoring a completed grace period? */ |
2349 | rcu_process_gp_end(rsp, rdp); | 2268 | note_gp_changes(rsp, rdp); |
2350 | check_for_new_grace_period(rsp, rdp); | ||
2351 | 2269 | ||
2352 | /* Start a new grace period if one not already started. */ | 2270 | /* Start a new grace period if one not already started. */ |
2353 | if (!rcu_gp_in_progress(rsp)) { | 2271 | if (!rcu_gp_in_progress(rsp)) { |
@@ -3235,6 +3153,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3235 | 3153 | ||
3236 | rsp->rda = rda; | 3154 | rsp->rda = rda; |
3237 | init_waitqueue_head(&rsp->gp_wq); | 3155 | init_waitqueue_head(&rsp->gp_wq); |
3156 | init_irq_work(&rsp->wakeup_work, rsp_wakeup); | ||
3238 | rnp = rsp->level[rcu_num_lvls - 1]; | 3157 | rnp = rsp->level[rcu_num_lvls - 1]; |
3239 | for_each_possible_cpu(i) { | 3158 | for_each_possible_cpu(i) { |
3240 | while (i > rnp->grphi) | 3159 | while (i > rnp->grphi) |
@@ -3252,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3252 | */ | 3171 | */ |
3253 | static void __init rcu_init_geometry(void) | 3172 | static void __init rcu_init_geometry(void) |
3254 | { | 3173 | { |
3174 | ulong d; | ||
3255 | int i; | 3175 | int i; |
3256 | int j; | 3176 | int j; |
3257 | int n = nr_cpu_ids; | 3177 | int n = nr_cpu_ids; |
3258 | int rcu_capacity[MAX_RCU_LVLS + 1]; | 3178 | int rcu_capacity[MAX_RCU_LVLS + 1]; |
3259 | 3179 | ||
3180 | /* | ||
3181 | * Initialize any unspecified boot parameters. | ||
3182 | * The default values of jiffies_till_first_fqs and | ||
3183 | * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS | ||
3184 | * value, which is a function of HZ, plus one for each | ||
3185 | * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. | ||
3186 | */ | ||
3187 | d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; | ||
3188 | if (jiffies_till_first_fqs == ULONG_MAX) | ||
3189 | jiffies_till_first_fqs = d; | ||
3190 | if (jiffies_till_next_fqs == ULONG_MAX) | ||
3191 | jiffies_till_next_fqs = d; | ||
3192 | |||
3260 | /* If the compile-time values are accurate, just leave. */ | 3193 | /* If the compile-time values are accurate, just leave. */ |
3261 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && | 3194 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && |
3262 | nr_cpu_ids == NR_CPUS) | 3195 | nr_cpu_ids == NR_CPUS) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index da77a8f57ff9..4a39d364493c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | #include <linux/irq_work.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | 33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
@@ -342,12 +343,17 @@ struct rcu_data { | |||
342 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 343 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
343 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 344 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
344 | 345 | ||
345 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 346 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) |
347 | /* For jiffies_till_first_fqs and */ | ||
348 | /* jiffies_till_next_fqs. */ | ||
346 | 349 | ||
347 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 350 | #define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ |
348 | /* to take at least one */ | 351 | /* delay between bouts of */ |
349 | /* scheduling clock irq */ | 352 | /* quiescent-state forcing. */ |
350 | /* before ratting on them. */ | 353 | |
354 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ | ||
355 | /* at least one scheduling clock */ | ||
356 | /* irq before ratting on them. */ | ||
351 | 357 | ||
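Worked example of the new defaults: with HZ=1000, RCU_JIFFIES_TILL_FORCE_QS evaluates to 1 + 1 + 1 = 3 jiffies, so rcu_init_geometry() (earlier in this patch) fills in jiffies_till_first_fqs and jiffies_till_next_fqs as 3 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; on a hypothetical 4096-CPU system that is 3 + 4096/256 = 19 jiffies, while a HZ=100 uniprocessor ends up with 1 + 0 = 1 jiffy. Values supplied on the boot line are left alone, since only the ULONG_MAX sentinels are overwritten.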
352 | #define rcu_wait(cond) \ | 358 | #define rcu_wait(cond) \ |
353 | do { \ | 359 | do { \ |
@@ -442,6 +448,7 @@ struct rcu_state { | |||
442 | char *name; /* Name of structure. */ | 448 | char *name; /* Name of structure. */ |
443 | char abbr; /* Abbreviated name. */ | 449 | char abbr; /* Abbreviated name. */ |
444 | struct list_head flavors; /* List of RCU flavors. */ | 450 | struct list_head flavors; /* List of RCU flavors. */ |
451 | struct irq_work wakeup_work; /* Postponed wakeups */ | ||
445 | }; | 452 | }; |
446 | 453 | ||
447 | /* Values for rcu_state structure's gp_flags field. */ | 454 | /* Values for rcu_state structure's gp_flags field. */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8dd..63098a59216e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5]; | |||
53 | static void __init rcu_bootup_announce_oddness(void) | 53 | static void __init rcu_bootup_announce_oddness(void) |
54 | { | 54 | { |
55 | #ifdef CONFIG_RCU_TRACE | 55 | #ifdef CONFIG_RCU_TRACE |
56 | printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); | 56 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); |
57 | #endif | 57 | #endif |
58 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | 58 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) |
59 | printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 59 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", |
60 | CONFIG_RCU_FANOUT); | 60 | CONFIG_RCU_FANOUT); |
61 | #endif | 61 | #endif |
62 | #ifdef CONFIG_RCU_FANOUT_EXACT | 62 | #ifdef CONFIG_RCU_FANOUT_EXACT |
63 | printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); | 63 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); |
64 | #endif | 64 | #endif |
65 | #ifdef CONFIG_RCU_FAST_NO_HZ | 65 | #ifdef CONFIG_RCU_FAST_NO_HZ |
66 | printk(KERN_INFO | 66 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); |
67 | "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | ||
68 | #endif | 67 | #endif |
69 | #ifdef CONFIG_PROVE_RCU | 68 | #ifdef CONFIG_PROVE_RCU |
70 | printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); | 69 | pr_info("\tRCU lockdep checking is enabled.\n"); |
71 | #endif | 70 | #endif |
72 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 71 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
73 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 72 | pr_info("\tRCU torture testing starts during boot.\n"); |
74 | #endif | 73 | #endif |
75 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | 74 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
76 | printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); | 75 | pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); |
77 | #endif | 76 | #endif |
78 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 77 | #if defined(CONFIG_RCU_CPU_STALL_INFO) |
79 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); | 78 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); |
80 | #endif | 79 | #endif |
81 | #if NUM_RCU_LVL_4 != 0 | 80 | #if NUM_RCU_LVL_4 != 0 |
82 | printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); | 81 | pr_info("\tFour-level hierarchy is enabled.\n"); |
83 | #endif | 82 | #endif |
84 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 83 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) |
85 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 84 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
86 | if (nr_cpu_ids != NR_CPUS) | 85 | if (nr_cpu_ids != NR_CPUS) |
87 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 86 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
88 | #ifdef CONFIG_RCU_NOCB_CPU | 87 | #ifdef CONFIG_RCU_NOCB_CPU |
89 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | 88 | #ifndef CONFIG_RCU_NOCB_CPU_NONE |
90 | if (!have_rcu_nocb_mask) { | 89 | if (!have_rcu_nocb_mask) { |
@@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void) | |||
92 | have_rcu_nocb_mask = true; | 91 | have_rcu_nocb_mask = true; |
93 | } | 92 | } |
94 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | 93 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO |
95 | pr_info("\tExperimental no-CBs CPU 0\n"); | 94 | pr_info("\tOffload RCU callbacks from CPU 0\n"); |
96 | cpumask_set_cpu(0, rcu_nocb_mask); | 95 | cpumask_set_cpu(0, rcu_nocb_mask); |
97 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ |
98 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL |
99 | pr_info("\tExperimental no-CBs for all CPUs\n"); | 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); |
100 | cpumask_setall(rcu_nocb_mask); | 99 | cpumask_setall(rcu_nocb_mask); |
101 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ |
102 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ |
103 | if (have_rcu_nocb_mask) { | 102 | if (have_rcu_nocb_mask) { |
104 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 103 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
105 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | 104 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); |
106 | if (rcu_nocb_poll) | 105 | if (rcu_nocb_poll) |
107 | pr_info("\tExperimental polled no-CBs CPUs.\n"); | 106 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
108 | } | 107 | } |
109 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 108 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
110 | } | 109 | } |
@@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); | |||
123 | */ | 122 | */ |
124 | static void __init rcu_bootup_announce(void) | 123 | static void __init rcu_bootup_announce(void) |
125 | { | 124 | { |
126 | printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); | 125 | pr_info("Preemptible hierarchical RCU implementation.\n"); |
127 | rcu_bootup_announce_oddness(); | 126 | rcu_bootup_announce_oddness(); |
128 | } | 127 | } |
129 | 128 | ||
@@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
490 | 489 | ||
491 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | 490 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) |
492 | { | 491 | { |
493 | printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | 492 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", |
494 | rnp->level, rnp->grplo, rnp->grphi); | 493 | rnp->level, rnp->grplo, rnp->grphi); |
495 | } | 494 | } |
496 | 495 | ||
497 | static void rcu_print_task_stall_end(void) | 496 | static void rcu_print_task_stall_end(void) |
498 | { | 497 | { |
499 | printk(KERN_CONT "\n"); | 498 | pr_cont("\n"); |
500 | } | 499 | } |
501 | 500 | ||
502 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 501 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
@@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
526 | t = list_entry(rnp->gp_tasks, | 525 | t = list_entry(rnp->gp_tasks, |
527 | struct task_struct, rcu_node_entry); | 526 | struct task_struct, rcu_node_entry); |
528 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | 527 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
529 | printk(KERN_CONT " P%d", t->pid); | 528 | pr_cont(" P%d", t->pid); |
530 | ndetected++; | 529 | ndetected++; |
531 | } | 530 | } |
532 | rcu_print_task_stall_end(); | 531 | rcu_print_task_stall_end(); |
@@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void) | |||
933 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 932 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
934 | } | 933 | } |
935 | 934 | ||
935 | /* | ||
936 | * Check for a task exiting while in a preemptible-RCU read-side | ||
937 | * critical section, and clean up if so. No need to issue warnings, | ||
938 | * as debug_check_no_locks_held() already does this if lockdep | ||
939 | * is enabled. | ||
940 | */ | ||
941 | void exit_rcu(void) | ||
942 | { | ||
943 | struct task_struct *t = current; | ||
944 | |||
945 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
946 | return; | ||
947 | t->rcu_read_lock_nesting = 1; | ||
948 | barrier(); | ||
949 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
950 | __rcu_read_unlock(); | ||
951 | } | ||
952 | |||
936 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 953 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
937 | 954 | ||
938 | static struct rcu_state *rcu_state = &rcu_sched_state; | 955 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state; | |||
942 | */ | 959 | */ |
943 | static void __init rcu_bootup_announce(void) | 960 | static void __init rcu_bootup_announce(void) |
944 | { | 961 | { |
945 | printk(KERN_INFO "Hierarchical RCU implementation.\n"); | 962 | pr_info("Hierarchical RCU implementation.\n"); |
946 | rcu_bootup_announce_oddness(); | 963 | rcu_bootup_announce_oddness(); |
947 | } | 964 | } |
948 | 965 | ||
@@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void) | |||
1101 | { | 1118 | { |
1102 | } | 1119 | } |
1103 | 1120 | ||
1121 | /* | ||
1122 | * Because preemptible RCU does not exist, tasks cannot possibly exit | ||
1123 | * while in preemptible RCU read-side critical sections. | ||
1124 | */ | ||
1125 | void exit_rcu(void) | ||
1126 | { | ||
1127 | } | ||
1128 | |||
1104 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1129 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1105 | 1130 | ||
1106 | #ifdef CONFIG_RCU_BOOST | 1131 | #ifdef CONFIG_RCU_BOOST |
@@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void) | |||
1629 | */ | 1654 | */ |
1630 | if (rdp->completed != rnp->completed && | 1655 | if (rdp->completed != rnp->completed && |
1631 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1656 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) |
1632 | rcu_process_gp_end(rsp, rdp); | 1657 | note_gp_changes(rsp, rdp); |
1633 | 1658 | ||
1634 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1659 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1635 | cbs_ready = true; | 1660 | cbs_ready = true; |
@@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | |||
1883 | /* Initiate the stall-info list. */ | 1908 | /* Initiate the stall-info list. */ |
1884 | static void print_cpu_stall_info_begin(void) | 1909 | static void print_cpu_stall_info_begin(void) |
1885 | { | 1910 | { |
1886 | printk(KERN_CONT "\n"); | 1911 | pr_cont("\n"); |
1887 | } | 1912 | } |
1888 | 1913 | ||
1889 | /* | 1914 | /* |
@@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
1914 | ticks_value = rsp->gpnum - rdp->gpnum; | 1939 | ticks_value = rsp->gpnum - rdp->gpnum; |
1915 | } | 1940 | } |
1916 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1941 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
1917 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", | 1942 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", |
1918 | cpu, ticks_value, ticks_title, | 1943 | cpu, ticks_value, ticks_title, |
1919 | atomic_read(&rdtp->dynticks) & 0xfff, | 1944 | atomic_read(&rdtp->dynticks) & 0xfff, |
1920 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1945 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
@@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
1925 | /* Terminate the stall-info list. */ | 1950 | /* Terminate the stall-info list. */ |
1926 | static void print_cpu_stall_info_end(void) | 1951 | static void print_cpu_stall_info_end(void) |
1927 | { | 1952 | { |
1928 | printk(KERN_ERR "\t"); | 1953 | pr_err("\t"); |
1929 | } | 1954 | } |
1930 | 1955 | ||
1931 | /* Zero ->ticks_this_gp for all flavors of RCU. */ | 1956 | /* Zero ->ticks_this_gp for all flavors of RCU. */ |
@@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void) | |||
1948 | 1973 | ||
1949 | static void print_cpu_stall_info_begin(void) | 1974 | static void print_cpu_stall_info_begin(void) |
1950 | { | 1975 | { |
1951 | printk(KERN_CONT " {"); | 1976 | pr_cont(" {"); |
1952 | } | 1977 | } |
1953 | 1978 | ||
1954 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | 1979 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) |
1955 | { | 1980 | { |
1956 | printk(KERN_CONT " %d", cpu); | 1981 | pr_cont(" %d", cpu); |
1957 | } | 1982 | } |
1958 | 1983 | ||
1959 | static void print_cpu_stall_info_end(void) | 1984 | static void print_cpu_stall_info_end(void) |
1960 | { | 1985 | { |
1961 | printk(KERN_CONT "} "); | 1986 | pr_cont("} "); |
1962 | } | 1987 | } |
1963 | 1988 | ||
1964 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | 1989 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) |
diff --git a/kernel/resource.c b/kernel/resource.c index d7386986e10e..77bf11a86c7d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn) | |||
409 | { | 409 | { |
410 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 410 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
411 | } | 411 | } |
412 | EXPORT_SYMBOL_GPL(page_is_ram); | ||
412 | 413 | ||
413 | void __weak arch_remove_reservations(struct resource *avail) | 414 | void __weak arch_remove_reservations(struct resource *avail) |
414 | { | 415 | { |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 1e09308bf2a1..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -145,6 +145,19 @@ int max_lock_depth = 1024; | |||
145 | /* | 145 | /* |
146 | * Adjust the priority chain. Also used for deadlock detection. | 146 | * Adjust the priority chain. Also used for deadlock detection. |
147 | * Decreases task's usage by one - may thus free the task. | 147 | * Decreases task's usage by one - may thus free the task. |
148 | * | ||
149 | * @task: the task owning the mutex (owner) for which a chain walk is probably | ||
150 | * needed | ||
151 | * @deadlock_detect: do we have to carry out deadlock detection? | ||
152 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | ||
153 | * things for a task that has just had its priority adjusted, and | ||
154 | * is waiting on a mutex) | ||
155 | * @orig_waiter: rt_mutex_waiter struct for the task that has just donated | ||
156 | * its priority to the mutex owner (can be NULL in the case | ||
157 | * depicted above or if the top waiter has gone away and we are | ||
158 | * actually deboosting the owner) | ||
159 | * @top_task: the current top waiter | ||
160 | * | ||
148 | * Returns 0 or -EDEADLK. | 161 | * Returns 0 or -EDEADLK. |
149 | */ | 162 | */ |
150 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, | 163 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1de..54adcf35f495 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9e..4a073539c58e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) | |||
77 | if (IS_ERR(tg)) | 77 | if (IS_ERR(tg)) |
78 | goto out_free; | 78 | goto out_free; |
79 | 79 | ||
80 | sched_online_group(tg, &root_task_group); | ||
81 | |||
82 | kref_init(&ag->kref); | 80 | kref_init(&ag->kref); |
83 | init_rwsem(&ag->lock); | 81 | init_rwsem(&ag->lock); |
84 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 82 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) | |||
98 | #endif | 96 | #endif |
99 | tg->autogroup = ag; | 97 | tg->autogroup = ag; |
100 | 98 | ||
99 | sched_online_group(tg, &root_task_group); | ||
101 | return ag; | 100 | return ag; |
102 | 101 | ||
103 | out_free: | 102 | out_free: |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..9b1f2e533b95 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu) | |||
633 | static inline bool got_nohz_idle_kick(void) | 633 | static inline bool got_nohz_idle_kick(void) |
634 | { | 634 | { |
635 | int cpu = smp_processor_id(); | 635 | int cpu = smp_processor_id(); |
636 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 636 | |
637 | if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) | ||
638 | return false; | ||
639 | |||
640 | if (idle_cpu(cpu) && !need_resched()) | ||
641 | return true; | ||
642 | |||
643 | /* | ||
644 | * We can't run the idle load balance on this CPU at this time, so we | ||
645 | * cancel it and clear NOHZ_BALANCE_KICK. | ||
646 | */ | ||
647 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | ||
648 | return false; | ||
637 | } | 649 | } |
638 | 650 | ||
639 | #else /* CONFIG_NO_HZ_COMMON */ | 651 | #else /* CONFIG_NO_HZ_COMMON */ |
@@ -667,7 +679,7 @@ void sched_avg_update(struct rq *rq) | |||
667 | { | 679 | { |
668 | s64 period = sched_avg_period(); | 680 | s64 period = sched_avg_period(); |
669 | 681 | ||
670 | while ((s64)(rq->clock - rq->age_stamp) > period) { | 682 | while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { |
671 | /* | 683 | /* |
672 | * Inline assembly required to prevent the compiler | 684 | * Inline assembly required to prevent the compiler |
673 | * optimising this loop into a divmod call. | 685 | * optimising this loop into a divmod call. |
@@ -1328,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
1328 | p->sched_class->task_woken(rq, p); | 1340 | p->sched_class->task_woken(rq, p); |
1329 | 1341 | ||
1330 | if (rq->idle_stamp) { | 1342 | if (rq->idle_stamp) { |
1331 | u64 delta = rq->clock - rq->idle_stamp; | 1343 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
1332 | u64 max = 2*sysctl_sched_migration_cost; | 1344 | u64 max = 2*sysctl_sched_migration_cost; |
1333 | 1345 | ||
1334 | if (delta > max) | 1346 | if (delta > max) |
@@ -1365,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1365 | 1377 | ||
1366 | rq = __task_rq_lock(p); | 1378 | rq = __task_rq_lock(p); |
1367 | if (p->on_rq) { | 1379 | if (p->on_rq) { |
1380 | /* check_preempt_curr() may use rq clock */ | ||
1381 | update_rq_clock(rq); | ||
1368 | ttwu_do_wakeup(rq, p, wake_flags); | 1382 | ttwu_do_wakeup(rq, p, wake_flags); |
1369 | ret = 1; | 1383 | ret = 1; |
1370 | } | 1384 | } |
@@ -1393,8 +1407,9 @@ static void sched_ttwu_pending(void) | |||
1393 | 1407 | ||
1394 | void scheduler_ipi(void) | 1408 | void scheduler_ipi(void) |
1395 | { | 1409 | { |
1396 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() | 1410 | if (llist_empty(&this_rq()->wake_list) |
1397 | && !tick_nohz_full_cpu(smp_processor_id())) | 1411 | && !tick_nohz_full_cpu(smp_processor_id()) |
1412 | && !got_nohz_idle_kick()) | ||
1398 | return; | 1413 | return; |
1399 | 1414 | ||
1400 | /* | 1415 | /* |
@@ -1417,7 +1432,7 @@ void scheduler_ipi(void) | |||
1417 | /* | 1432 | /* |
1418 | * Check if someone kicked us for doing the nohz idle load balance. | 1433 | * Check if someone kicked us for doing the nohz idle load balance. |
1419 | */ | 1434 | */ |
1420 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | 1435 | if (unlikely(got_nohz_idle_kick())) { |
1421 | this_rq()->idle_balance = 1; | 1436 | this_rq()->idle_balance = 1; |
1422 | raise_softirq_irqoff(SCHED_SOFTIRQ); | 1437 | raise_softirq_irqoff(SCHED_SOFTIRQ); |
1423 | } | 1438 | } |
@@ -1596,15 +1611,6 @@ static void __sched_fork(struct task_struct *p) | |||
1596 | p->se.vruntime = 0; | 1611 | p->se.vruntime = 0; |
1597 | INIT_LIST_HEAD(&p->se.group_node); | 1612 | INIT_LIST_HEAD(&p->se.group_node); |
1598 | 1613 | ||
1599 | /* | ||
1600 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
1601 | * removed when useful for applications beyond shares distribution (e.g. | ||
1602 | * load-balance). | ||
1603 | */ | ||
1604 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1605 | p->se.avg.runnable_avg_period = 0; | ||
1606 | p->se.avg.runnable_avg_sum = 0; | ||
1607 | #endif | ||
1608 | #ifdef CONFIG_SCHEDSTATS | 1614 | #ifdef CONFIG_SCHEDSTATS |
1609 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1615 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1610 | #endif | 1616 | #endif |
@@ -1748,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p) | |||
1748 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1754 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
1749 | #endif | 1755 | #endif |
1750 | 1756 | ||
1757 | /* Initialize new task's runnable average */ | ||
1758 | init_task_runnable_average(p); | ||
1751 | rq = __task_rq_lock(p); | 1759 | rq = __task_rq_lock(p); |
1752 | activate_task(rq, p, 0); | 1760 | activate_task(rq, p, 0); |
1753 | p->on_rq = 1; | 1761 | p->on_rq = 1; |
@@ -2056,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu) | |||
2056 | return atomic_read(&this->nr_iowait); | 2064 | return atomic_read(&this->nr_iowait); |
2057 | } | 2065 | } |
2058 | 2066 | ||
2059 | unsigned long this_cpu_load(void) | ||
2060 | { | ||
2061 | struct rq *this = this_rq(); | ||
2062 | return this->cpu_load[0]; | ||
2063 | } | ||
2064 | |||
2065 | |||
2066 | /* | ||
2067 | * Global load-average calculations | ||
2068 | * | ||
2069 | * We take a distributed and async approach to calculating the global load-avg | ||
2070 | * in order to minimize overhead. | ||
2071 | * | ||
2072 | * The global load average is an exponentially decaying average of nr_running + | ||
2073 | * nr_uninterruptible. | ||
2074 | * | ||
2075 | * Once every LOAD_FREQ: | ||
2076 | * | ||
2077 | * nr_active = 0; | ||
2078 | * for_each_possible_cpu(cpu) | ||
2079 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2080 | * | ||
2081 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2082 | * | ||
2083 | * Due to a number of reasons the above turns in the mess below: | ||
2084 | * | ||
2085 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2086 | * a serious number of cpus, therefore we need to take a distributed approach | ||
2087 | * to calculating nr_active. | ||
2088 | * | ||
2089 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2090 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2091 | * | ||
2092 | * So assuming nr_active := 0 when we start out -- true by definition, we | ||
2093 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2094 | * to obtain the same result. See calc_load_fold_active(). | ||
2095 | * | ||
2096 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2097 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2098 | * cpu to have completed this task. | ||
2099 | * | ||
2100 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
2101 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2102 | * | ||
2103 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2104 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2105 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2106 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2107 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2108 | * all cpus yields the correct result. | ||
2109 | * | ||
2110 | * This covers the NO_HZ=n code, for extra headaches, see the comment below. | ||
2111 | */ | ||
2112 | |||
2113 | /* Variables and functions for calc_load */ | ||
2114 | static atomic_long_t calc_load_tasks; | ||
2115 | static unsigned long calc_load_update; | ||
2116 | unsigned long avenrun[3]; | ||
2117 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
2118 | |||
2119 | /** | ||
2120 | * get_avenrun - get the load average array | ||
2121 | * @loads: pointer to dest load array | ||
2122 | * @offset: offset to add | ||
2123 | * @shift: shift count to shift the result left | ||
2124 | * | ||
2125 | * These values are estimates at best, so no need for locking. | ||
2126 | */ | ||
2127 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2128 | { | ||
2129 | loads[0] = (avenrun[0] + offset) << shift; | ||
2130 | loads[1] = (avenrun[1] + offset) << shift; | ||
2131 | loads[2] = (avenrun[2] + offset) << shift; | ||
2132 | } | ||
2133 | |||
2134 | static long calc_load_fold_active(struct rq *this_rq) | ||
2135 | { | ||
2136 | long nr_active, delta = 0; | ||
2137 | |||
2138 | nr_active = this_rq->nr_running; | ||
2139 | nr_active += (long) this_rq->nr_uninterruptible; | ||
2140 | |||
2141 | if (nr_active != this_rq->calc_load_active) { | ||
2142 | delta = nr_active - this_rq->calc_load_active; | ||
2143 | this_rq->calc_load_active = nr_active; | ||
2144 | } | ||
2145 | |||
2146 | return delta; | ||
2147 | } | ||
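To make the delta-folding described above concrete, here is a stand-alone C sketch (a toy model, not kernel code) showing that publishing only each CPU's change since its last fold still leaves the global accumulator equal to the sum of the current per-CPU active counts:

    /* Toy model of the distributed nr_active accounting: each CPU only
     * publishes the change since its last fold, yet the global
     * accumulator still equals the sum of the current per-CPU counts. */
    #include <stdio.h>

    #define NR_CPUS 4

    static long calc_load_active[NR_CPUS];   /* last value each CPU folded */
    static long calc_load_tasks;             /* global accumulator */

    static void fold_active(int cpu, long nr_active)
    {
            long delta = nr_active - calc_load_active[cpu];

            calc_load_active[cpu] = nr_active;
            if (delta)
                    calc_load_tasks += delta;
    }

    int main(void)
    {
            long round1[NR_CPUS] = { 2, 0, 1, 3 };   /* two rounds of samples */
            long round2[NR_CPUS] = { 1, 4, 1, 0 };
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    fold_active(cpu, round1[cpu]);
            printf("after round 1: %ld (expect 6)\n", calc_load_tasks);

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    fold_active(cpu, round2[cpu]);
            printf("after round 2: %ld (expect 6)\n", calc_load_tasks);
            return 0;
    }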
2148 | |||
2149 | /* | ||
2150 | * a1 = a0 * e + a * (1 - e) | ||
2151 | */ | ||
2152 | static unsigned long | ||
2153 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
2154 | { | ||
2155 | load *= exp; | ||
2156 | load += active * (FIXED_1 - exp); | ||
2157 | load += 1UL << (FSHIFT - 1); | ||
2158 | return load >> FSHIFT; | ||
2159 | } | ||
2160 | |||
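A minimal user-space sketch of the a1 = a0 * e + a * (1 - e) update above, using the fixed-point constants the kernel defines in include/linux/sched.h (FSHIFT = 11, EXP_1 = 1884); it shows the 1-minute figure converging toward a steady three runnable tasks:

    /* Stand-alone sketch of the avenrun[] update; constants mirror the
     * kernel's 11-bit fixed-point load-average representation. */
    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)        /* 1.0 in fixed point */
    #define EXP_1   1884                 /* 1/exp(5s/1min) in fixed point */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1); /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun0 = 0;
            unsigned long active = 3 * FIXED_1;  /* pretend 3 runnable tasks */
            int i;

            for (i = 0; i < 24; i++) {           /* 24 * 5s = 2 minutes */
                    avenrun0 = calc_load(avenrun0, EXP_1, active);
                    printf("sample %2d: load1 = %lu.%02lu\n", i,
                           avenrun0 >> FSHIFT,
                           ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            }
            return 0;
    }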
2161 | #ifdef CONFIG_NO_HZ_COMMON | ||
2162 | /* | ||
2163 | * Handle NO_HZ for the global load-average. | ||
2164 | * | ||
2165 | * Since the above described distributed algorithm to compute the global | ||
2166 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2167 | * NO_HZ. | ||
2168 | * | ||
2169 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2170 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2171 | * when we read the global state. | ||
2172 | * | ||
2173 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2174 | * | ||
2175 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2176 | * contribution, causing under-accounting. | ||
2177 | * | ||
2178 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2179 | * when the window starts, thus separating old and new NO_HZ load. | ||
2180 | * | ||
2181 | * The only trick is the slight shift in index flip for read vs write. | ||
2182 | * | ||
2183 | * 0s 5s 10s 15s | ||
2184 | * +10 +10 +10 +10 | ||
2185 | * |-|-----------|-|-----------|-|-----------|-| | ||
2186 | * r:0 0 1 1 0 0 1 1 0 | ||
2187 | * w:0 1 1 0 0 1 1 0 0 | ||
2188 | * | ||
2189 | * This ensures we'll fold the old idle contribution in this window while | ||
2190 | * accumulating the new one. | ||
2191 | * | ||
2192 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2193 | * contribution, since we effectively move our sample point to a known | ||
2194 | * busy state. | ||
2195 | * | ||
2196 | * This is solved by pushing the window forward, and thus skipping the | ||
2197 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2198 | * was in effect at the time the window opened). This also solves the issue | ||
2199 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2200 | * LOAD_FREQ intervals. | ||
2201 | * | ||
2202 | * When making the ILB scale, we should try to pull this in as well. | ||
2203 | */ | ||
2204 | static atomic_long_t calc_load_idle[2]; | ||
2205 | static int calc_load_idx; | ||
2206 | |||
2207 | static inline int calc_load_write_idx(void) | ||
2208 | { | ||
2209 | int idx = calc_load_idx; | ||
2210 | |||
2211 | /* | ||
2212 | * See calc_global_nohz(), if we observe the new index, we also | ||
2213 | * need to observe the new update time. | ||
2214 | */ | ||
2215 | smp_rmb(); | ||
2216 | |||
2217 | /* | ||
2218 | * If the folding window started, make sure we start writing in the | ||
2219 | * next idle-delta. | ||
2220 | */ | ||
2221 | if (!time_before(jiffies, calc_load_update)) | ||
2222 | idx++; | ||
2223 | |||
2224 | return idx & 1; | ||
2225 | } | ||
2226 | |||
2227 | static inline int calc_load_read_idx(void) | ||
2228 | { | ||
2229 | return calc_load_idx & 1; | ||
2230 | } | ||
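The read/write index skew is easier to see in a simplified model; the sketch below (a hypothetical single-threaded simplification, with no memory barriers) shows idle deltas written after the window opens landing in the next slot while the reader drains the old one:

    /* Minimal model of the two idle-delta slots: once the fold window
     * opens, writers move to the next slot while the reader still drains
     * the previous one, so no concurrent update is lost. */
    #include <stdio.h>

    static long calc_load_idle[2];
    static int calc_load_idx;
    static int window_open;   /* stands in for !time_before(jiffies, calc_load_update) */

    static int write_idx(void) { return (calc_load_idx + window_open) & 1; }
    static int read_idx(void)  { return calc_load_idx & 1; }

    int main(void)
    {
            calc_load_idle[write_idx()] += 2;   /* CPU goes idle before the window */

            window_open = 1;                    /* fold window opens */
            calc_load_idle[write_idx()] += 5;   /* new idle entries land in the next slot */

            /* reader drains the old slot, then flips the index */
            printf("folded %ld (expect 2), pending %ld (expect 5)\n",
                   calc_load_idle[read_idx()], calc_load_idle[write_idx()]);
            calc_load_idle[read_idx()] = 0;
            calc_load_idx++;
            window_open = 0;
            return 0;
    }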
2231 | |||
2232 | void calc_load_enter_idle(void) | ||
2233 | { | ||
2234 | struct rq *this_rq = this_rq(); | ||
2235 | long delta; | ||
2236 | |||
2237 | /* | ||
2238 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2239 | * into the pending idle delta. | ||
2240 | */ | ||
2241 | delta = calc_load_fold_active(this_rq); | ||
2242 | if (delta) { | ||
2243 | int idx = calc_load_write_idx(); | ||
2244 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2245 | } | ||
2246 | } | ||
2247 | |||
2248 | void calc_load_exit_idle(void) | ||
2249 | { | ||
2250 | struct rq *this_rq = this_rq(); | ||
2251 | |||
2252 | /* | ||
2253 | * If we're still before the sample window, we're done. | ||
2254 | */ | ||
2255 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2256 | return; | ||
2257 | |||
2258 | /* | ||
2259 | * We woke inside or after the sample window, this means we're already | ||
2260 | * accounted through the nohz accounting, so skip the entire deal and | ||
2261 | * sync up for the next window. | ||
2262 | */ | ||
2263 | this_rq->calc_load_update = calc_load_update; | ||
2264 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2265 | this_rq->calc_load_update += LOAD_FREQ; | ||
2266 | } | ||
2267 | |||
2268 | static long calc_load_fold_idle(void) | ||
2269 | { | ||
2270 | int idx = calc_load_read_idx(); | ||
2271 | long delta = 0; | ||
2272 | |||
2273 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2274 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2275 | |||
2276 | return delta; | ||
2277 | } | ||
2278 | |||
2279 | /** | ||
2280 | * fixed_power_int - compute: x^n, in O(log n) time | ||
2281 | * | ||
2282 | * @x: base of the power | ||
2283 | * @frac_bits: fractional bits of @x | ||
2284 | * @n: power to raise @x to. | ||
2285 | * | ||
2286 | * By exploiting the relation between the definition of the natural power | ||
2287 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
2288 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
2289 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
2290 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
2291 | * of course trivially computable in O(log_2 n), the length of our binary | ||
2292 | * vector. | ||
2293 | */ | ||
2294 | static unsigned long | ||
2295 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
2296 | { | ||
2297 | unsigned long result = 1UL << frac_bits; | ||
2298 | |||
2299 | if (n) for (;;) { | ||
2300 | if (n & 1) { | ||
2301 | result *= x; | ||
2302 | result += 1UL << (frac_bits - 1); | ||
2303 | result >>= frac_bits; | ||
2304 | } | ||
2305 | n >>= 1; | ||
2306 | if (!n) | ||
2307 | break; | ||
2308 | x *= x; | ||
2309 | x += 1UL << (frac_bits - 1); | ||
2310 | x >>= frac_bits; | ||
2311 | } | ||
2312 | |||
2313 | return result; | ||
2314 | } | ||
2315 | |||
2316 | /* | ||
2317 | * a1 = a0 * e + a * (1 - e) | ||
2318 | * | ||
2319 | * a2 = a1 * e + a * (1 - e) | ||
2320 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
2321 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
2322 | * | ||
2323 | * a3 = a2 * e + a * (1 - e) | ||
2324 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
2325 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
2326 | * | ||
2327 | * ... | ||
2328 | * | ||
2329 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
2330 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
2331 | * = a0 * e^n + a * (1 - e^n) | ||
2332 | * | ||
2333 | * [1] application of the geometric series: | ||
2334 | * | ||
2335 | * n 1 - x^(n+1) | ||
2336 | * S_n := \Sum x^i = ------------- | ||
2337 | * i=0 1 - x | ||
2338 | */ | ||
2339 | static unsigned long | ||
2340 | calc_load_n(unsigned long load, unsigned long exp, | ||
2341 | unsigned long active, unsigned int n) | ||
2342 | { | ||
2343 | |||
2344 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
2345 | } | ||
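As a quick sanity check of the closed form, the stand-alone sketch below (same fixed-point constants as in the earlier sketch, assumed to match the kernel's) compares n successive calc_load() steps with a single catch-up built on fixed_power_int(); the two results agree up to rounding:

    /* Check that decaying over n missed periods via fixed_power_int()
     * matches folding calc_load() n times with the same active value. */
    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)
    #define EXP_1   1884

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);
            return load >> FSHIFT;
    }

    static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                         unsigned int n)
    {
            unsigned long result = 1UL << frac_bits;

            if (n) for (;;) {
                    if (n & 1) {
                            result *= x;
                            result += 1UL << (frac_bits - 1);
                            result >>= frac_bits;
                    }
                    n >>= 1;
                    if (!n)
                            break;
                    x *= x;
                    x += 1UL << (frac_bits - 1);
                    x >>= frac_bits;
            }
            return result;
    }

    int main(void)
    {
            unsigned long load = 5 * FIXED_1, active = 0;
            unsigned long iterated = load, i, n = 7;

            for (i = 0; i < n; i++)
                    iterated = calc_load(iterated, EXP_1, active);

            printf("iterated: %lu, closed form: %lu\n", iterated,
                   calc_load(load, fixed_power_int(EXP_1, FSHIFT, n), active));
            return 0;
    }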
2346 | |||
2347 | /* | ||
2348 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
2349 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
2350 | * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold | ||
2351 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
2352 | * | ||
2353 | * Once we've updated the global active value, we need to apply the exponential | ||
2354 | * weights adjusted to the number of cycles missed. | ||
2355 | */ | ||
2356 | static void calc_global_nohz(void) | ||
2357 | { | ||
2358 | long delta, active, n; | ||
2359 | |||
2360 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
2361 | /* | ||
2362 | * Catch-up, fold however many we are behind still | ||
2363 | */ | ||
2364 | delta = jiffies - calc_load_update - 10; | ||
2365 | n = 1 + (delta / LOAD_FREQ); | ||
2366 | |||
2367 | active = atomic_long_read(&calc_load_tasks); | ||
2368 | active = active > 0 ? active * FIXED_1 : 0; | ||
2369 | |||
2370 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2371 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2372 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2373 | |||
2374 | calc_load_update += n * LOAD_FREQ; | ||
2375 | } | ||
2376 | |||
2377 | /* | ||
2378 | * Flip the idle index... | ||
2379 | * | ||
2380 | * Make sure we first write the new time then flip the index, so that | ||
2381 | * calc_load_write_idx() will see the new time when it reads the new | ||
2382 | * index, this avoids a double flip messing things up. | ||
2383 | */ | ||
2384 | smp_wmb(); | ||
2385 | calc_load_idx++; | ||
2386 | } | ||
2387 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
2388 | |||
2389 | static inline long calc_load_fold_idle(void) { return 0; } | ||
2390 | static inline void calc_global_nohz(void) { } | ||
2391 | |||
2392 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2393 | |||
2394 | /* | ||
2395 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
2396 | * CPUs have updated calc_load_tasks. | ||
2397 | */ | ||
2398 | void calc_global_load(unsigned long ticks) | ||
2399 | { | ||
2400 | long active, delta; | ||
2401 | |||
2402 | if (time_before(jiffies, calc_load_update + 10)) | ||
2403 | return; | ||
2404 | |||
2405 | /* | ||
2406 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2407 | */ | ||
2408 | delta = calc_load_fold_idle(); | ||
2409 | if (delta) | ||
2410 | atomic_long_add(delta, &calc_load_tasks); | ||
2411 | |||
2412 | active = atomic_long_read(&calc_load_tasks); | ||
2413 | active = active > 0 ? active * FIXED_1 : 0; | ||
2414 | |||
2415 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
2416 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
2417 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
2418 | |||
2419 | calc_load_update += LOAD_FREQ; | ||
2420 | |||
2421 | /* | ||
2422 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
2423 | */ | ||
2424 | calc_global_nohz(); | ||
2425 | } | ||
2426 | |||
2427 | /* | ||
2428 | * Called from update_cpu_load() to periodically update this CPU's | ||
2429 | * active count. | ||
2430 | */ | ||
2431 | static void calc_load_account_active(struct rq *this_rq) | ||
2432 | { | ||
2433 | long delta; | ||
2434 | |||
2435 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2436 | return; | ||
2437 | |||
2438 | delta = calc_load_fold_active(this_rq); | ||
2439 | if (delta) | ||
2440 | atomic_long_add(delta, &calc_load_tasks); | ||
2441 | |||
2442 | this_rq->calc_load_update += LOAD_FREQ; | ||
2443 | } | ||
2444 | |||
2445 | /* | ||
2446 | * End of global load-average stuff | ||
2447 | */ | ||
2448 | |||
2449 | /* | ||
2450 | * The exact cpuload at various idx values, calculated at every tick would be | ||
2451 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
2452 | * | ||
2453 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
2454 | * on nth tick when cpu may be busy, then we have: | ||
2455 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2456 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
2457 | * | ||
2458 | * decay_load_missed() below does efficient calculation of | ||
2459 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2460 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
2461 | * | ||
2462 | * The calculation is approximated on a 128 point scale. | ||
2463 | * degrade_zero_ticks is the number of ticks after which load at any | ||
2464 | * particular idx is approximated to be zero. | ||
2465 | * degrade_factor is a precomputed table, a row for each load idx. | ||
2466 | * Each column corresponds to degradation factor for a power of two ticks, | ||
2467 | * based on 128 point scale. | ||
2468 | * Example: | ||
2469 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
2470 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
2471 | * | ||
2472 | * With this power of 2 load factors, we can degrade the load n times | ||
2473 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
2474 | * n mult/shifts needed by the exact degradation. | ||
2475 | */ | ||
2476 | #define DEGRADE_SHIFT 7 | ||
2477 | static const unsigned char | ||
2478 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
2479 | static const unsigned char | ||
2480 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
2481 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
2482 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
2483 | {96, 72, 40, 12, 1, 0, 0}, | ||
2484 | {112, 98, 75, 43, 15, 1, 0}, | ||
2485 | {120, 112, 98, 76, 45, 16, 2} }; | ||
2486 | |||
2487 | /* | ||
2488 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
2489 | * would be when CPU is idle and so we just decay the old load without | ||
2490 | * adding any new load. | ||
2491 | */ | ||
2492 | static unsigned long | ||
2493 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
2494 | { | ||
2495 | int j = 0; | ||
2496 | |||
2497 | if (!missed_updates) | ||
2498 | return load; | ||
2499 | |||
2500 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
2501 | return 0; | ||
2502 | |||
2503 | if (idx == 1) | ||
2504 | return load >> missed_updates; | ||
2505 | |||
2506 | while (missed_updates) { | ||
2507 | if (missed_updates % 2) | ||
2508 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
2509 | |||
2510 | missed_updates >>= 1; | ||
2511 | j++; | ||
2512 | } | ||
2513 | return load; | ||
2514 | } | ||
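A small stand-alone check of the bit-wise decay: degrading a load for 5 missed ticks at idx 2 using the precomputed row approximates multiplying by (3/4)^5, which is what a naive per-tick loop would compute:

    /* Sanity sketch for decay_load_missed(): 5 = 0b101 missed ticks at
     * idx 2 is handled as one 1-tick step (96/128) and one 4-tick step
     * (40/128), approximating (3/4)^5. */
    #include <stdio.h>

    #define DEGRADE_SHIFT 7

    static const unsigned char degrade_factor_idx2[DEGRADE_SHIFT + 1] =
            { 96, 72, 40, 12, 1, 0, 0, 0 };   /* row for idx 2, 128-point scale */

    static unsigned long decay_idx2(unsigned long load, unsigned long missed)
    {
            int j = 0;

            while (missed) {
                    if (missed & 1)
                            load = (load * degrade_factor_idx2[j]) >> DEGRADE_SHIFT;
                    missed >>= 1;
                    j++;
            }
            return load;
    }

    int main(void)
    {
            unsigned long load = 1000, naive = 1000, missed = 5;
            int i;

            for (i = 0; i < 5; i++)
                    naive = naive * 3 / 4;       /* per-tick (3/4) decay */

            printf("table-based: %lu, naive loop: %lu\n",
                   decay_idx2(load, missed), naive);
            return 0;
    }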
2515 | |||
2516 | /* | ||
2517 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
2518 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
2519 | * every tick. We fix it up based on jiffies. | ||
2520 | */ | ||
2521 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
2522 | unsigned long pending_updates) | ||
2523 | { | ||
2524 | int i, scale; | ||
2525 | |||
2526 | this_rq->nr_load_updates++; | ||
2527 | |||
2528 | /* Update our load: */ | ||
2529 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
2530 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
2531 | unsigned long old_load, new_load; | ||
2532 | |||
2533 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
2534 | |||
2535 | old_load = this_rq->cpu_load[i]; | ||
2536 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
2537 | new_load = this_load; | ||
2538 | /* | ||
2539 | * Round up the averaging division if load is increasing. This | ||
2540 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2541 | * example. | ||
2542 | */ | ||
2543 | if (new_load > old_load) | ||
2544 | new_load += scale - 1; | ||
2545 | |||
2546 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
2547 | } | ||
2548 | |||
2549 | sched_avg_update(this_rq); | ||
2550 | } | ||
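The per-index averaging and its round-up-on-increase trick can be seen in isolation below (a stand-alone sketch); without the "scale - 1" adjustment, an old load of 9 would never quite reach a sustained load of 10:

    /* cpu_load[i] is a (2^i - 1)/2^i weighted average of the old value
     * and the new load; rounding up when the load rises keeps it from
     * sticking just below the target. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long old_load = 9, this_load = 10;
            int i;

            for (i = 1; i <= 4; i++) {
                    unsigned long scale = 1UL << i;
                    unsigned long new_load = this_load;

                    if (new_load > old_load)     /* round up on the way up */
                            new_load += scale - 1;

                    printf("idx %d: %lu -> %lu\n", i, old_load,
                           (old_load * (scale - 1) + new_load) >> i);
            }
            return 0;
    }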
2551 | |||
2552 | #ifdef CONFIG_NO_HZ_COMMON | ||
2553 | /* | ||
2554 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2555 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2556 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2557 | * | ||
2558 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2559 | * would seriously skew the load calculation. However we'll make do for those | ||
2560 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2561 | * (tick_nohz_idle_exit). | ||
2562 | * | ||
2563 | * This means we might still be one tick off for nohz periods. | ||
2564 | */ | ||
2565 | |||
2566 | /* | ||
2567 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2568 | * idle balance. | ||
2569 | */ | ||
2570 | void update_idle_cpu_load(struct rq *this_rq) | ||
2571 | { | ||
2572 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2573 | unsigned long load = this_rq->load.weight; | ||
2574 | unsigned long pending_updates; | ||
2575 | |||
2576 | /* | ||
2577 | * bail if there's load or we're actually up-to-date. | ||
2578 | */ | ||
2579 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2580 | return; | ||
2581 | |||
2582 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2583 | this_rq->last_load_update_tick = curr_jiffies; | ||
2584 | |||
2585 | __update_cpu_load(this_rq, load, pending_updates); | ||
2586 | } | ||
2587 | |||
2588 | /* | ||
2589 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2590 | */ | ||
2591 | void update_cpu_load_nohz(void) | ||
2592 | { | ||
2593 | struct rq *this_rq = this_rq(); | ||
2594 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2595 | unsigned long pending_updates; | ||
2596 | |||
2597 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2598 | return; | ||
2599 | |||
2600 | raw_spin_lock(&this_rq->lock); | ||
2601 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2602 | if (pending_updates) { | ||
2603 | this_rq->last_load_update_tick = curr_jiffies; | ||
2604 | /* | ||
2605 | * We were idle, this means load 0, the current load might be | ||
2606 | * !0 due to remote wakeups and the like. | ||
2607 | */ | ||
2608 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2609 | } | ||
2610 | raw_spin_unlock(&this_rq->lock); | ||
2611 | } | ||
2612 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2613 | |||
2614 | /* | ||
2615 | * Called from scheduler_tick() | ||
2616 | */ | ||
2617 | static void update_cpu_load_active(struct rq *this_rq) | ||
2618 | { | ||
2619 | /* | ||
2620 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2621 | */ | ||
2622 | this_rq->last_load_update_tick = jiffies; | ||
2623 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2624 | |||
2625 | calc_load_account_active(this_rq); | ||
2626 | } | ||
2627 | |||
2628 | #ifdef CONFIG_SMP | 2067 | #ifdef CONFIG_SMP |
2629 | 2068 | ||
2630 | /* | 2069 | /* |
@@ -2673,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
2673 | 2112 | ||
2674 | if (task_current(rq, p)) { | 2113 | if (task_current(rq, p)) { |
2675 | update_rq_clock(rq); | 2114 | update_rq_clock(rq); |
2676 | ns = rq->clock_task - p->se.exec_start; | 2115 | ns = rq_clock_task(rq) - p->se.exec_start; |
2677 | if ((s64)ns < 0) | 2116 | if ((s64)ns < 0) |
2678 | ns = 0; | 2117 | ns = 0; |
2679 | } | 2118 | } |
@@ -2726,8 +2165,8 @@ void scheduler_tick(void) | |||
2726 | 2165 | ||
2727 | raw_spin_lock(&rq->lock); | 2166 | raw_spin_lock(&rq->lock); |
2728 | update_rq_clock(rq); | 2167 | update_rq_clock(rq); |
2729 | update_cpu_load_active(rq); | ||
2730 | curr->sched_class->task_tick(rq, curr, 0); | 2168 | curr->sched_class->task_tick(rq, curr, 0); |
2169 | update_cpu_load_active(rq); | ||
2731 | raw_spin_unlock(&rq->lock); | 2170 | raw_spin_unlock(&rq->lock); |
2732 | 2171 | ||
2733 | perf_event_task_tick(); | 2172 | perf_event_task_tick(); |
@@ -4745,7 +4184,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4745 | */ | 4184 | */ |
4746 | idle->sched_class = &idle_sched_class; | 4185 | idle->sched_class = &idle_sched_class; |
4747 | ftrace_graph_init_idle_task(idle, cpu); | 4186 | ftrace_graph_init_idle_task(idle, cpu); |
4748 | vtime_init_idle(idle); | 4187 | vtime_init_idle(idle, cpu); |
4749 | #if defined(CONFIG_SMP) | 4188 | #if defined(CONFIG_SMP) |
4750 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); | 4189 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); |
4751 | #endif | 4190 | #endif |
@@ -4947,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
4947 | */ | 4386 | */ |
4948 | rq->stop = NULL; | 4387 | rq->stop = NULL; |
4949 | 4388 | ||
4389 | /* | ||
4390 | * put_prev_task() and pick_next_task() sched | ||
4391 | * class method both need to have an up-to-date | ||
4392 | * value of rq->clock[_task] | ||
4393 | */ | ||
4394 | update_rq_clock(rq); | ||
4395 | |||
4950 | for ( ; ; ) { | 4396 | for ( ; ; ) { |
4951 | /* | 4397 | /* |
4952 | * There's this thread running, bail when that's the only | 4398 | * There's this thread running, bail when that's the only |
@@ -5080,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5080 | return table; | 4526 | return table; |
5081 | } | 4527 | } |
5082 | 4528 | ||
5083 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 4529 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
5084 | { | 4530 | { |
5085 | struct ctl_table *entry, *table; | 4531 | struct ctl_table *entry, *table; |
5086 | struct sched_domain *sd; | 4532 | struct sched_domain *sd; |
@@ -5894,7 +5340,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5894 | get_group(cpu, sdd, &sd->groups); | 5340 | get_group(cpu, sdd, &sd->groups); |
5895 | atomic_inc(&sd->groups->ref); | 5341 | atomic_inc(&sd->groups->ref); |
5896 | 5342 | ||
5897 | if (cpu != cpumask_first(sched_domain_span(sd))) | 5343 | if (cpu != cpumask_first(span)) |
5898 | return 0; | 5344 | return 0; |
5899 | 5345 | ||
5900 | lockdep_assert_held(&sched_domains_mutex); | 5346 | lockdep_assert_held(&sched_domains_mutex); |
@@ -5904,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5904 | 5350 | ||
5905 | for_each_cpu(i, span) { | 5351 | for_each_cpu(i, span) { |
5906 | struct sched_group *sg; | 5352 | struct sched_group *sg; |
5907 | int group = get_group(i, sdd, &sg); | 5353 | int group, j; |
5908 | int j; | ||
5909 | 5354 | ||
5910 | if (cpumask_test_cpu(i, covered)) | 5355 | if (cpumask_test_cpu(i, covered)) |
5911 | continue; | 5356 | continue; |
5912 | 5357 | ||
5358 | group = get_group(i, sdd, &sg); | ||
5913 | cpumask_clear(sched_group_cpus(sg)); | 5359 | cpumask_clear(sched_group_cpus(sg)); |
5914 | sg->sgp->power = 0; | 5360 | sg->sgp->power = 0; |
5915 | cpumask_setall(sched_group_mask(sg)); | 5361 | cpumask_setall(sched_group_mask(sg)); |
@@ -5947,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
5947 | { | 5393 | { |
5948 | struct sched_group *sg = sd->groups; | 5394 | struct sched_group *sg = sd->groups; |
5949 | 5395 | ||
5950 | WARN_ON(!sd || !sg); | 5396 | WARN_ON(!sg); |
5951 | 5397 | ||
5952 | do { | 5398 | do { |
5953 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | 5399 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); |
@@ -6112,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6112 | 5558 | ||
6113 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 5559 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6114 | 5560 | ||
5561 | #define for_each_sd_topology(tl) \ | ||
5562 | for (tl = sched_domain_topology; tl->init; tl++) | ||
5563 | |||
6115 | #ifdef CONFIG_NUMA | 5564 | #ifdef CONFIG_NUMA |
6116 | 5565 | ||
6117 | static int sched_domains_numa_levels; | 5566 | static int sched_domains_numa_levels; |
@@ -6409,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6409 | struct sched_domain_topology_level *tl; | 5858 | struct sched_domain_topology_level *tl; |
6410 | int j; | 5859 | int j; |
6411 | 5860 | ||
6412 | for (tl = sched_domain_topology; tl->init; tl++) { | 5861 | for_each_sd_topology(tl) { |
6413 | struct sd_data *sdd = &tl->data; | 5862 | struct sd_data *sdd = &tl->data; |
6414 | 5863 | ||
6415 | sdd->sd = alloc_percpu(struct sched_domain *); | 5864 | sdd->sd = alloc_percpu(struct sched_domain *); |
@@ -6462,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6462 | struct sched_domain_topology_level *tl; | 5911 | struct sched_domain_topology_level *tl; |
6463 | int j; | 5912 | int j; |
6464 | 5913 | ||
6465 | for (tl = sched_domain_topology; tl->init; tl++) { | 5914 | for_each_sd_topology(tl) { |
6466 | struct sd_data *sdd = &tl->data; | 5915 | struct sd_data *sdd = &tl->data; |
6467 | 5916 | ||
6468 | for_each_cpu(j, cpu_map) { | 5917 | for_each_cpu(j, cpu_map) { |
@@ -6490,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6490 | } | 5939 | } |
6491 | 5940 | ||
6492 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | 5941 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, |
6493 | struct s_data *d, const struct cpumask *cpu_map, | 5942 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6494 | struct sched_domain_attr *attr, struct sched_domain *child, | 5943 | struct sched_domain *child, int cpu) |
6495 | int cpu) | ||
6496 | { | 5944 | { |
6497 | struct sched_domain *sd = tl->init(tl, cpu); | 5945 | struct sched_domain *sd = tl->init(tl, cpu); |
6498 | if (!sd) | 5946 | if (!sd) |
@@ -6503,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6503 | sd->level = child->level + 1; | 5951 | sd->level = child->level + 1; |
6504 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 5952 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6505 | child->parent = sd; | 5953 | child->parent = sd; |
5954 | sd->child = child; | ||
6506 | } | 5955 | } |
6507 | sd->child = child; | ||
6508 | set_domain_attribute(sd, attr); | 5956 | set_domain_attribute(sd, attr); |
6509 | 5957 | ||
6510 | return sd; | 5958 | return sd; |
@@ -6517,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6517 | static int build_sched_domains(const struct cpumask *cpu_map, | 5965 | static int build_sched_domains(const struct cpumask *cpu_map, |
6518 | struct sched_domain_attr *attr) | 5966 | struct sched_domain_attr *attr) |
6519 | { | 5967 | { |
6520 | enum s_alloc alloc_state = sa_none; | 5968 | enum s_alloc alloc_state; |
6521 | struct sched_domain *sd; | 5969 | struct sched_domain *sd; |
6522 | struct s_data d; | 5970 | struct s_data d; |
6523 | int i, ret = -ENOMEM; | 5971 | int i, ret = -ENOMEM; |
@@ -6531,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6531 | struct sched_domain_topology_level *tl; | 5979 | struct sched_domain_topology_level *tl; |
6532 | 5980 | ||
6533 | sd = NULL; | 5981 | sd = NULL; |
6534 | for (tl = sched_domain_topology; tl->init; tl++) { | 5982 | for_each_sd_topology(tl) { |
6535 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | 5983 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); |
5984 | if (tl == sched_domain_topology) | ||
5985 | *per_cpu_ptr(d.sd, i) = sd; | ||
6536 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | 5986 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
6537 | sd->flags |= SD_OVERLAP; | 5987 | sd->flags |= SD_OVERLAP; |
6538 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | 5988 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) |
6539 | break; | 5989 | break; |
6540 | } | 5990 | } |
6541 | |||
6542 | while (sd->child) | ||
6543 | sd = sd->child; | ||
6544 | |||
6545 | *per_cpu_ptr(d.sd, i) = sd; | ||
6546 | } | 5991 | } |
6547 | 5992 | ||
6548 | /* Build the groups for the domains */ | 5993 | /* Build the groups for the domains */ |
@@ -6854,9 +6299,6 @@ void __init sched_init_smp(void) | |||
6854 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6299 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
6855 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); | 6300 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
6856 | 6301 | ||
6857 | /* RT runtime code needs to handle some hotplug events */ | ||
6858 | hotcpu_notifier(update_runtime, 0); | ||
6859 | |||
6860 | init_hrtick(); | 6302 | init_hrtick(); |
6861 | 6303 | ||
6862 | /* Move init over to a non-isolated CPU */ | 6304 | /* Move init over to a non-isolated CPU */ |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..a7959e05a9d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) | |||
515 | 515 | ||
516 | for (;;) { | 516 | for (;;) { |
517 | /* Make sure "rtime" is the bigger of stime/rtime */ | 517 | /* Make sure "rtime" is the bigger of stime/rtime */ |
518 | if (stime > rtime) { | 518 | if (stime > rtime) |
519 | u64 tmp = rtime; rtime = stime; stime = tmp; | 519 | swap(rtime, stime); |
520 | } | ||
521 | 520 | ||
522 | /* Make sure 'total' fits in 32 bits */ | 521 | /* Make sure 'total' fits in 32 bits */ |
523 | if (total >> 32) | 522 | if (total >> 32) |
@@ -747,17 +746,17 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
747 | 746 | ||
748 | write_seqlock(¤t->vtime_seqlock); | 747 | write_seqlock(¤t->vtime_seqlock); |
749 | current->vtime_snap_whence = VTIME_SYS; | 748 | current->vtime_snap_whence = VTIME_SYS; |
750 | current->vtime_snap = sched_clock(); | 749 | current->vtime_snap = sched_clock_cpu(smp_processor_id()); |
751 | write_sequnlock(¤t->vtime_seqlock); | 750 | write_sequnlock(¤t->vtime_seqlock); |
752 | } | 751 | } |
753 | 752 | ||
754 | void vtime_init_idle(struct task_struct *t) | 753 | void vtime_init_idle(struct task_struct *t, int cpu) |
755 | { | 754 | { |
756 | unsigned long flags; | 755 | unsigned long flags; |
757 | 756 | ||
758 | write_seqlock_irqsave(&t->vtime_seqlock, flags); | 757 | write_seqlock_irqsave(&t->vtime_seqlock, flags); |
759 | t->vtime_snap_whence = VTIME_SYS; | 758 | t->vtime_snap_whence = VTIME_SYS; |
760 | t->vtime_snap = sched_clock(); | 759 | t->vtime_snap = sched_clock_cpu(cpu); |
761 | write_sequnlock_irqrestore(&t->vtime_seqlock, flags); | 760 | write_sequnlock_irqrestore(&t->vtime_seqlock, flags); |
762 | } | 761 | } |
763 | 762 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a673520..e076bddd4c66 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
209 | cfs_rq->nr_spread_over); | 209 | cfs_rq->nr_spread_over); |
210 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); | 210 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
211 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 211 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
212 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
213 | #ifdef CONFIG_SMP | 212 | #ifdef CONFIG_SMP |
214 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", | 213 | SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", |
215 | cfs_rq->runnable_load_avg); | 214 | cfs_rq->runnable_load_avg); |
216 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", | 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
217 | cfs_rq->blocked_load_avg); | 216 | cfs_rq->blocked_load_avg); |
218 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", | 217 | #ifdef CONFIG_FAIR_GROUP_SCHED |
219 | (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); | 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
220 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", | ||
221 | cfs_rq->tg_load_contrib); | 219 | cfs_rq->tg_load_contrib); |
222 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | 220 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", |
223 | cfs_rq->tg_runnable_contrib); | 221 | cfs_rq->tg_runnable_contrib); |
222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", | ||
223 | atomic_long_read(&cfs_rq->tg->load_avg)); | ||
224 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | 224 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", |
225 | atomic_read(&cfs_rq->tg->runnable_avg)); | 225 | atomic_read(&cfs_rq->tg->runnable_avg)); |
226 | #endif | 226 | #endif |
227 | #endif | ||
227 | 228 | ||
229 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
228 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 230 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
229 | #endif | 231 | #endif |
230 | } | 232 | } |
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
493 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, | 495 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, |
494 | get_nr_threads(p)); | 496 | get_nr_threads(p)); |
495 | SEQ_printf(m, | 497 | SEQ_printf(m, |
496 | "---------------------------------------------------------\n"); | 498 | "---------------------------------------------------------" |
499 | "----------\n"); | ||
497 | #define __P(F) \ | 500 | #define __P(F) \ |
498 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) | 501 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
499 | #define P(F) \ | 502 | #define P(F) \ |
500 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) | 503 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
501 | #define __PN(F) \ | 504 | #define __PN(F) \ |
502 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 505 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
503 | #define PN(F) \ | 506 | #define PN(F) \ |
504 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 507 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
505 | 508 | ||
506 | PN(se.exec_start); | 509 | PN(se.exec_start); |
507 | PN(se.vruntime); | 510 | PN(se.vruntime); |
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
560 | } | 563 | } |
561 | #endif | 564 | #endif |
562 | __P(nr_switches); | 565 | __P(nr_switches); |
563 | SEQ_printf(m, "%-35s:%21Ld\n", | 566 | SEQ_printf(m, "%-45s:%21Ld\n", |
564 | "nr_voluntary_switches", (long long)p->nvcsw); | 567 | "nr_voluntary_switches", (long long)p->nvcsw); |
565 | SEQ_printf(m, "%-35s:%21Ld\n", | 568 | SEQ_printf(m, "%-45s:%21Ld\n", |
566 | "nr_involuntary_switches", (long long)p->nivcsw); | 569 | "nr_involuntary_switches", (long long)p->nivcsw); |
567 | 570 | ||
568 | P(se.load.weight); | 571 | P(se.load.weight); |
572 | #ifdef CONFIG_SMP | ||
573 | P(se.avg.runnable_avg_sum); | ||
574 | P(se.avg.runnable_avg_period); | ||
575 | P(se.avg.load_avg_contrib); | ||
576 | P(se.avg.decay_count); | ||
577 | #endif | ||
569 | P(policy); | 578 | P(policy); |
570 | P(prio); | 579 | P(prio); |
571 | #undef PN | 580 | #undef PN |
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
579 | 588 | ||
580 | t0 = cpu_clock(this_cpu); | 589 | t0 = cpu_clock(this_cpu); |
581 | t1 = cpu_clock(this_cpu); | 590 | t1 = cpu_clock(this_cpu); |
582 | SEQ_printf(m, "%-35s:%21Ld\n", | 591 | SEQ_printf(m, "%-45s:%21Ld\n", |
583 | "clock-delta", (long long)(t1-t0)); | 592 | "clock-delta", (long long)(t1-t0)); |
584 | } | 593 | } |
585 | } | 594 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..f77f9c527449 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
113 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 113 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
114 | #endif | 114 | #endif |
115 | 115 | ||
116 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
117 | { | ||
118 | lw->weight += inc; | ||
119 | lw->inv_weight = 0; | ||
120 | } | ||
121 | |||
122 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
123 | { | ||
124 | lw->weight -= dec; | ||
125 | lw->inv_weight = 0; | ||
126 | } | ||
127 | |||
128 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
129 | { | ||
130 | lw->weight = w; | ||
131 | lw->inv_weight = 0; | ||
132 | } | ||
133 | |||
116 | /* | 134 | /* |
117 | * Increase the granularity value when there are more CPUs, | 135 | * Increase the granularity value when there are more CPUs, |
118 | * because with more CPUs the 'effective latency' as visible | 136 | * because with more CPUs the 'effective latency' as visible |
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
662 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 680 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
663 | } | 681 | } |
664 | 682 | ||
683 | #ifdef CONFIG_SMP | ||
684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | ||
685 | |||
686 | /* Give a new task initial runnable values to weight its load heavily early on */ | ||
687 | void init_task_runnable_average(struct task_struct *p) | ||
688 | { | ||
689 | u32 slice; | ||
690 | |||
691 | p->se.avg.decay_count = 0; | ||
692 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | ||
693 | p->se.avg.runnable_avg_sum = slice; | ||
694 | p->se.avg.runnable_avg_period = slice; | ||
695 | __update_task_entity_contrib(&p->se); | ||
696 | } | ||
697 | #else | ||
698 | void init_task_runnable_average(struct task_struct *p) | ||
699 | { | ||
700 | } | ||
701 | #endif | ||
702 | |||
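The shift by 10 in the newly added init_task_runnable_average() converts sched_slice()'s nanosecond result into the roughly 1 microsecond (1024 ns) segments the per-entity runnable averages are accumulated in, so a fresh task starts out looking fully runnable for one slice; a rough illustration with an assumed 6 ms slice:

    /* Illustration only: a hypothetical 6ms slice expressed in 1024ns
     * units seeds both runnable_avg_sum and runnable_avg_period, so the
     * new task initially appears 100% runnable. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t slice_ns = 6 * 1000 * 1000;     /* assumed 6ms slice */
            uint32_t slice = (uint32_t)(slice_ns >> 10);

            printf("runnable_avg_sum = runnable_avg_period = %u\n", slice);
            printf("initial runnable ratio = %u/%u\n", slice, slice);
            return 0;
    }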
665 | /* | 703 | /* |
666 | * Update the current task's runtime statistics. Skip current tasks that | 704 | * Update the current task's runtime statistics. Skip current tasks that |
667 | * are not in our scheduling class. | 705 | * are not in our scheduling class. |
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
686 | static void update_curr(struct cfs_rq *cfs_rq) | 724 | static void update_curr(struct cfs_rq *cfs_rq) |
687 | { | 725 | { |
688 | struct sched_entity *curr = cfs_rq->curr; | 726 | struct sched_entity *curr = cfs_rq->curr; |
689 | u64 now = rq_of(cfs_rq)->clock_task; | 727 | u64 now = rq_clock_task(rq_of(cfs_rq)); |
690 | unsigned long delta_exec; | 728 | unsigned long delta_exec; |
691 | 729 | ||
692 | if (unlikely(!curr)) | 730 | if (unlikely(!curr)) |
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
718 | static inline void | 756 | static inline void |
719 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 757 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
720 | { | 758 | { |
721 | schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); | 759 | schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); |
722 | } | 760 | } |
723 | 761 | ||
724 | /* | 762 | /* |
@@ -738,14 +776,14 @@ static void | |||
738 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 776 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
739 | { | 777 | { |
740 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, | 778 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, |
741 | rq_of(cfs_rq)->clock - se->statistics.wait_start)); | 779 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); |
742 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); | 780 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); |
743 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + | 781 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + |
744 | rq_of(cfs_rq)->clock - se->statistics.wait_start); | 782 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); |
745 | #ifdef CONFIG_SCHEDSTATS | 783 | #ifdef CONFIG_SCHEDSTATS |
746 | if (entity_is_task(se)) { | 784 | if (entity_is_task(se)) { |
747 | trace_sched_stat_wait(task_of(se), | 785 | trace_sched_stat_wait(task_of(se), |
748 | rq_of(cfs_rq)->clock - se->statistics.wait_start); | 786 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); |
749 | } | 787 | } |
750 | #endif | 788 | #endif |
751 | schedstat_set(se->statistics.wait_start, 0); | 789 | schedstat_set(se->statistics.wait_start, 0); |
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
771 | /* | 809 | /* |
772 | * We are starting a new run period: | 810 | * We are starting a new run period: |
773 | */ | 811 | */ |
774 | se->exec_start = rq_of(cfs_rq)->clock_task; | 812 | se->exec_start = rq_clock_task(rq_of(cfs_rq)); |
775 | } | 813 | } |
776 | 814 | ||
777 | /************************************************** | 815 | /************************************************** |
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
1037 | * to gain a more accurate current total weight. See | 1075 | * to gain a more accurate current total weight. See |
1038 | * update_cfs_rq_load_contribution(). | 1076 | * update_cfs_rq_load_contribution(). |
1039 | */ | 1077 | */ |
1040 | tg_weight = atomic64_read(&tg->load_avg); | 1078 | tg_weight = atomic_long_read(&tg->load_avg); |
1041 | tg_weight -= cfs_rq->tg_load_contrib; | 1079 | tg_weight -= cfs_rq->tg_load_contrib; |
1042 | tg_weight += cfs_rq->load.weight; | 1080 | tg_weight += cfs_rq->load.weight; |
1043 | 1081 | ||
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
1110 | } | 1148 | } |
1111 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 1149 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
1112 | 1150 | ||
1113 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ | 1151 | #ifdef CONFIG_SMP |
1114 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1115 | /* | 1152 | /* |
1116 | * We choose a half-life close to 1 scheduling period. | 1153 | * We choose a half-life close to 1 scheduling period. |
1117 | * Note: The tables below are dependent on this value. | 1154 | * Note: The tables below are dependent on this value. |
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
1319 | int force_update) | 1356 | int force_update) |
1320 | { | 1357 | { |
1321 | struct task_group *tg = cfs_rq->tg; | 1358 | struct task_group *tg = cfs_rq->tg; |
1322 | s64 tg_contrib; | 1359 | long tg_contrib; |
1323 | 1360 | ||
1324 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 1361 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
1325 | tg_contrib -= cfs_rq->tg_load_contrib; | 1362 | tg_contrib -= cfs_rq->tg_load_contrib; |
1326 | 1363 | ||
1327 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 1364 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
1328 | atomic64_add(tg_contrib, &tg->load_avg); | 1365 | atomic_long_add(tg_contrib, &tg->load_avg); |
1329 | cfs_rq->tg_load_contrib += tg_contrib; | 1366 | cfs_rq->tg_load_contrib += tg_contrib; |
1330 | } | 1367 | } |
1331 | } | 1368 | } |
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
1360 | u64 contrib; | 1397 | u64 contrib; |
1361 | 1398 | ||
1362 | contrib = cfs_rq->tg_load_contrib * tg->shares; | 1399 | contrib = cfs_rq->tg_load_contrib * tg->shares; |
1363 | se->avg.load_avg_contrib = div64_u64(contrib, | 1400 | se->avg.load_avg_contrib = div_u64(contrib, |
1364 | atomic64_read(&tg->load_avg) + 1); | 1401 | atomic_long_read(&tg->load_avg) + 1); |
1365 | 1402 | ||
1366 | /* | 1403 | /* |
1367 | * For group entities we need to compute a correction term in the case | 1404 | * For group entities we need to compute a correction term in the case |
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | |||
1480 | if (!decays && !force_update) | 1517 | if (!decays && !force_update) |
1481 | return; | 1518 | return; |
1482 | 1519 | ||
1483 | if (atomic64_read(&cfs_rq->removed_load)) { | 1520 | if (atomic_long_read(&cfs_rq->removed_load)) { |
1484 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | 1521 | unsigned long removed_load; |
1522 | removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); | ||
1485 | subtract_blocked_load_contrib(cfs_rq, removed_load); | 1523 | subtract_blocked_load_contrib(cfs_rq, removed_load); |
1486 | } | 1524 | } |
1487 | 1525 | ||
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | |||
1497 | 1535 | ||
1498 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 1536 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
1499 | { | 1537 | { |
1500 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | 1538 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); |
1501 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 1539 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
1502 | } | 1540 | } |
1503 | 1541 | ||
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1510 | * We track migrations using entity decay_count <= 0, on a wake-up | 1548 | * We track migrations using entity decay_count <= 0, on a wake-up |
1511 | * migration we use a negative decay count to track the remote decays | 1549 | * migration we use a negative decay count to track the remote decays |
1512 | * accumulated while sleeping. | 1550 | * accumulated while sleeping. |
1551 | * | ||
1552 | * Newly forked tasks are enqueued with se->avg.decay_count == 0, they | ||
1553 | * are seen by enqueue_entity_load_avg() as a migration with an already | ||
1554 | * constructed load_avg_contrib. | ||
1513 | */ | 1555 | */ |
1514 | if (unlikely(se->avg.decay_count <= 0)) { | 1556 | if (unlikely(se->avg.decay_count <= 0)) { |
1515 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | 1557 | se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); |
1516 | if (se->avg.decay_count) { | 1558 | if (se->avg.decay_count) { |
1517 | /* | 1559 | /* |
1518 | * In a wake-up migration we have to approximate the | 1560 | * In a wake-up migration we have to approximate the |
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1530 | } | 1572 | } |
1531 | wakeup = 0; | 1573 | wakeup = 0; |
1532 | } else { | 1574 | } else { |
1533 | __synchronize_entity_decay(se); | 1575 | /* |
1576 | * Task re-woke on same cpu (or else migrate_task_rq_fair() | ||
1577 | * would have made count negative); we must be careful to avoid | ||
1578 | * double-accounting blocked time after synchronizing decays. | ||
1579 | */ | ||
1580 | se->avg.last_runnable_update += __synchronize_entity_decay(se) | ||
1581 | << 20; | ||
1534 | } | 1582 | } |
1535 | 1583 | ||
1536 | /* migrated tasks did not contribute to our blocked load */ | 1584 | /* migrated tasks did not contribute to our blocked load */ |
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1607 | tsk = task_of(se); | 1655 | tsk = task_of(se); |
1608 | 1656 | ||
1609 | if (se->statistics.sleep_start) { | 1657 | if (se->statistics.sleep_start) { |
1610 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; | 1658 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; |
1611 | 1659 | ||
1612 | if ((s64)delta < 0) | 1660 | if ((s64)delta < 0) |
1613 | delta = 0; | 1661 | delta = 0; |
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1624 | } | 1672 | } |
1625 | } | 1673 | } |
1626 | if (se->statistics.block_start) { | 1674 | if (se->statistics.block_start) { |
1627 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; | 1675 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; |
1628 | 1676 | ||
1629 | if ((s64)delta < 0) | 1677 | if ((s64)delta < 0) |
1630 | delta = 0; | 1678 | delta = 0; |
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1712 | { | 1760 | { |
1713 | /* | 1761 | /* |
1714 | * Update the normalized vruntime before updating min_vruntime | 1762 | * Update the normalized vruntime before updating min_vruntime |
1715 | * through callig update_curr(). | 1763 | * through calling update_curr(). |
1716 | */ | 1764 | */ |
1717 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) | 1765 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) |
1718 | se->vruntime += cfs_rq->min_vruntime; | 1766 | se->vruntime += cfs_rq->min_vruntime; |
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1805 | struct task_struct *tsk = task_of(se); | 1853 | struct task_struct *tsk = task_of(se); |
1806 | 1854 | ||
1807 | if (tsk->state & TASK_INTERRUPTIBLE) | 1855 | if (tsk->state & TASK_INTERRUPTIBLE) |
1808 | se->statistics.sleep_start = rq_of(cfs_rq)->clock; | 1856 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); |
1809 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 1857 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
1810 | se->statistics.block_start = rq_of(cfs_rq)->clock; | 1858 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); |
1811 | } | 1859 | } |
1812 | #endif | 1860 | #endif |
1813 | } | 1861 | } |
@@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
2082 | if (unlikely(cfs_rq->throttle_count)) | 2130 | if (unlikely(cfs_rq->throttle_count)) |
2083 | return cfs_rq->throttled_clock_task; | 2131 | return cfs_rq->throttled_clock_task; |
2084 | 2132 | ||
2085 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | 2133 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; |
2086 | } | 2134 | } |
2087 | 2135 | ||
2088 | /* returns 0 on failure to allocate runtime */ | 2136 | /* returns 0 on failure to allocate runtime */ |
@@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
2138 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 2186 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
2139 | { | 2187 | { |
2140 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 2188 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); |
2141 | struct rq *rq = rq_of(cfs_rq); | ||
2142 | 2189 | ||
2143 | /* if the deadline is ahead of our clock, nothing to do */ | 2190 | /* if the deadline is ahead of our clock, nothing to do */ |
2144 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | 2191 | if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) |
2145 | return; | 2192 | return; |
2146 | 2193 | ||
2147 | if (cfs_rq->runtime_remaining < 0) | 2194 | if (cfs_rq->runtime_remaining < 0) |
@@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
2230 | #ifdef CONFIG_SMP | 2277 | #ifdef CONFIG_SMP |
2231 | if (!cfs_rq->throttle_count) { | 2278 | if (!cfs_rq->throttle_count) { |
2232 | /* adjust cfs_rq_clock_task() */ | 2279 | /* adjust cfs_rq_clock_task() */ |
2233 | cfs_rq->throttled_clock_task_time += rq->clock_task - | 2280 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
2234 | cfs_rq->throttled_clock_task; | 2281 | cfs_rq->throttled_clock_task; |
2235 | } | 2282 | } |
2236 | #endif | 2283 | #endif |
@@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
2245 | 2292 | ||
2246 | /* group is entering throttled state, stop time */ | 2293 | /* group is entering throttled state, stop time */ |
2247 | if (!cfs_rq->throttle_count) | 2294 | if (!cfs_rq->throttle_count) |
2248 | cfs_rq->throttled_clock_task = rq->clock_task; | 2295 | cfs_rq->throttled_clock_task = rq_clock_task(rq); |
2249 | cfs_rq->throttle_count++; | 2296 | cfs_rq->throttle_count++; |
2250 | 2297 | ||
2251 | return 0; | 2298 | return 0; |
@@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
2284 | rq->nr_running -= task_delta; | 2331 | rq->nr_running -= task_delta; |
2285 | 2332 | ||
2286 | cfs_rq->throttled = 1; | 2333 | cfs_rq->throttled = 1; |
2287 | cfs_rq->throttled_clock = rq->clock; | 2334 | cfs_rq->throttled_clock = rq_clock(rq); |
2288 | raw_spin_lock(&cfs_b->lock); | 2335 | raw_spin_lock(&cfs_b->lock); |
2289 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 2336 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
2290 | raw_spin_unlock(&cfs_b->lock); | 2337 | raw_spin_unlock(&cfs_b->lock); |
@@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
2298 | int enqueue = 1; | 2345 | int enqueue = 1; |
2299 | long task_delta; | 2346 | long task_delta; |
2300 | 2347 | ||
2301 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 2348 | se = cfs_rq->tg->se[cpu_of(rq)]; |
2302 | 2349 | ||
2303 | cfs_rq->throttled = 0; | 2350 | cfs_rq->throttled = 0; |
2351 | |||
2352 | update_rq_clock(rq); | ||
2353 | |||
2304 | raw_spin_lock(&cfs_b->lock); | 2354 | raw_spin_lock(&cfs_b->lock); |
2305 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; | 2355 | cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; |
2306 | list_del_rcu(&cfs_rq->throttled_list); | 2356 | list_del_rcu(&cfs_rq->throttled_list); |
2307 | raw_spin_unlock(&cfs_b->lock); | 2357 | raw_spin_unlock(&cfs_b->lock); |
2308 | 2358 | ||
2309 | update_rq_clock(rq); | ||
2310 | /* update hierarchical throttle state */ | 2359 | /* update hierarchical throttle state */ |
2311 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | 2360 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); |
2312 | 2361 | ||
@@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
2599 | throttle_cfs_rq(cfs_rq); | 2648 | throttle_cfs_rq(cfs_rq); |
2600 | } | 2649 | } |
2601 | 2650 | ||
2602 | static inline u64 default_cfs_period(void); | ||
2603 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
2604 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
2605 | |||
2606 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | 2651 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) |
2607 | { | 2652 | { |
2608 | struct cfs_bandwidth *cfs_b = | 2653 | struct cfs_bandwidth *cfs_b = |
@@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2706 | #else /* CONFIG_CFS_BANDWIDTH */ | 2751 | #else /* CONFIG_CFS_BANDWIDTH */ |
2707 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 2752 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
2708 | { | 2753 | { |
2709 | return rq_of(cfs_rq)->clock_task; | 2754 | return rq_clock_task(rq_of(cfs_rq)); |
2710 | } | 2755 | } |
2711 | 2756 | ||
2712 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2757 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
@@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2919 | /* Used instead of source_load when we know the type == 0 */ | 2964 | /* Used instead of source_load when we know the type == 0 */ |
2920 | static unsigned long weighted_cpuload(const int cpu) | 2965 | static unsigned long weighted_cpuload(const int cpu) |
2921 | { | 2966 | { |
2922 | return cpu_rq(cpu)->load.weight; | 2967 | return cpu_rq(cpu)->cfs.runnable_load_avg; |
2923 | } | 2968 | } |
2924 | 2969 | ||
2925 | /* | 2970 | /* |
@@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
2964 | { | 3009 | { |
2965 | struct rq *rq = cpu_rq(cpu); | 3010 | struct rq *rq = cpu_rq(cpu); |
2966 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 3011 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
3012 | unsigned long load_avg = rq->cfs.runnable_load_avg; | ||
2967 | 3013 | ||
2968 | if (nr_running) | 3014 | if (nr_running) |
2969 | return rq->load.weight / nr_running; | 3015 | return load_avg / nr_running; |
2970 | 3016 | ||
2971 | return 0; | 3017 | return 0; |
2972 | } | 3018 | } |
@@ -3416,12 +3462,6 @@ unlock: | |||
3416 | } | 3462 | } |
3417 | 3463 | ||
3418 | /* | 3464 | /* |
3419 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
3420 | * removed when useful for applications beyond shares distribution (e.g. | ||
3421 | * load-balance). | ||
3422 | */ | ||
3423 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
3424 | /* | ||
3425 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 3465 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and |
3426 | * cfs_rq_of(p) references at time of call are still valid and identify the | 3466 | * cfs_rq_of(p) references at time of call are still valid and identify the |
3427 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 3467 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
@@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) | |||
3441 | */ | 3481 | */ |
3442 | if (se->avg.decay_count) { | 3482 | if (se->avg.decay_count) { |
3443 | se->avg.decay_count = -__synchronize_entity_decay(se); | 3483 | se->avg.decay_count = -__synchronize_entity_decay(se); |
3444 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | 3484 | atomic_long_add(se->avg.load_avg_contrib, |
3485 | &cfs_rq->removed_load); | ||
3445 | } | 3486 | } |
3446 | } | 3487 | } |
3447 | #endif | ||
3448 | #endif /* CONFIG_SMP */ | 3488 | #endif /* CONFIG_SMP */ |
3449 | 3489 | ||
3450 | static unsigned long | 3490 | static unsigned long |
@@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3946 | * 2) too many balance attempts have failed. | 3986 | * 2) too many balance attempts have failed. |
3947 | */ | 3987 | */ |
3948 | 3988 | ||
3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3989 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
3950 | if (!tsk_cache_hot || | 3990 | if (!tsk_cache_hot || |
3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3991 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3952 | 3992 | ||
@@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
4141 | long cpu = (long)data; | 4181 | long cpu = (long)data; |
4142 | 4182 | ||
4143 | if (!tg->parent) { | 4183 | if (!tg->parent) { |
4144 | load = cpu_rq(cpu)->load.weight; | 4184 | load = cpu_rq(cpu)->avg.load_avg_contrib; |
4145 | } else { | 4185 | } else { |
4146 | load = tg->parent->cfs_rq[cpu]->h_load; | 4186 | load = tg->parent->cfs_rq[cpu]->h_load; |
4147 | load *= tg->se[cpu]->load.weight; | 4187 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, |
4148 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 4188 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); |
4149 | } | 4189 | } |
4150 | 4190 | ||
4151 | tg->cfs_rq[cpu]->h_load = load; | 4191 | tg->cfs_rq[cpu]->h_load = load; |
@@ -4171,12 +4211,9 @@ static void update_h_load(long cpu) | |||
4171 | static unsigned long task_h_load(struct task_struct *p) | 4211 | static unsigned long task_h_load(struct task_struct *p) |
4172 | { | 4212 | { |
4173 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4213 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4174 | unsigned long load; | ||
4175 | |||
4176 | load = p->se.load.weight; | ||
4177 | load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); | ||
4178 | 4214 | ||
4179 | return load; | 4215 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4216 | cfs_rq->runnable_load_avg + 1); | ||
4180 | } | 4217 | } |
4181 | #else | 4218 | #else |
4182 | static inline void update_blocked_averages(int cpu) | 4219 | static inline void update_blocked_averages(int cpu) |
@@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu) | |||
4189 | 4226 | ||
4190 | static unsigned long task_h_load(struct task_struct *p) | 4227 | static unsigned long task_h_load(struct task_struct *p) |
4191 | { | 4228 | { |
4192 | return p->se.load.weight; | 4229 | return p->se.avg.load_avg_contrib; |
4193 | } | 4230 | } |
4194 | #endif | 4231 | #endif |
4195 | 4232 | ||
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu) | |||
4302 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 4339 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
4303 | avg = ACCESS_ONCE(rq->rt_avg); | 4340 | avg = ACCESS_ONCE(rq->rt_avg); |
4304 | 4341 | ||
4305 | total = sched_avg_period() + (rq->clock - age_stamp); | 4342 | total = sched_avg_period() + (rq_clock(rq) - age_stamp); |
4306 | 4343 | ||
4307 | if (unlikely(total < avg)) { | 4344 | if (unlikely(total < avg)) { |
4308 | /* Ensures that power won't end up being negative */ | 4345 | /* Ensures that power won't end up being negative */ |
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5241 | int pulled_task = 0; | 5278 | int pulled_task = 0; |
5242 | unsigned long next_balance = jiffies + HZ; | 5279 | unsigned long next_balance = jiffies + HZ; |
5243 | 5280 | ||
5244 | this_rq->idle_stamp = this_rq->clock; | 5281 | this_rq->idle_stamp = rq_clock(this_rq); |
5245 | 5282 | ||
5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5283 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5247 | return; | 5284 | return; |
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
5418 | static inline void set_cpu_sd_state_busy(void) | 5455 | static inline void set_cpu_sd_state_busy(void) |
5419 | { | 5456 | { |
5420 | struct sched_domain *sd; | 5457 | struct sched_domain *sd; |
5421 | int cpu = smp_processor_id(); | ||
5422 | 5458 | ||
5423 | rcu_read_lock(); | 5459 | rcu_read_lock(); |
5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); | 5460 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); |
5425 | 5461 | ||
5426 | if (!sd || !sd->nohz_idle) | 5462 | if (!sd || !sd->nohz_idle) |
5427 | goto unlock; | 5463 | goto unlock; |
@@ -5436,10 +5472,9 @@ unlock: | |||
5436 | void set_cpu_sd_state_idle(void) | 5472 | void set_cpu_sd_state_idle(void) |
5437 | { | 5473 | { |
5438 | struct sched_domain *sd; | 5474 | struct sched_domain *sd; |
5439 | int cpu = smp_processor_id(); | ||
5440 | 5475 | ||
5441 | rcu_read_lock(); | 5476 | rcu_read_lock(); |
5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); | 5477 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); |
5443 | 5478 | ||
5444 | if (!sd || sd->nohz_idle) | 5479 | if (!sd || sd->nohz_idle) |
5445 | goto unlock; | 5480 | goto unlock; |
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5848 | se->vruntime -= cfs_rq->min_vruntime; | 5883 | se->vruntime -= cfs_rq->min_vruntime; |
5849 | } | 5884 | } |
5850 | 5885 | ||
5851 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 5886 | #ifdef CONFIG_SMP |
5852 | /* | 5887 | /* |
5853 | * Remove our load from contribution when we leave sched_fair | 5888 | * Remove our load from contribution when we leave sched_fair |
5854 | * and ensure we don't carry in an old decay_count if we | 5889 | * and ensure we don't carry in an old decay_count if we |
@@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5907 | #ifndef CONFIG_64BIT | 5942 | #ifndef CONFIG_64BIT |
5908 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5943 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5909 | #endif | 5944 | #endif |
5910 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 5945 | #ifdef CONFIG_SMP |
5911 | atomic64_set(&cfs_rq->decay_counter, 1); | 5946 | atomic64_set(&cfs_rq->decay_counter, 1); |
5912 | atomic64_set(&cfs_rq->removed_load, 0); | 5947 | atomic_long_set(&cfs_rq->removed_load, 0); |
5913 | #endif | 5948 | #endif |
5914 | } | 5949 | } |
5915 | 5950 | ||
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
6091 | se = tg->se[i]; | 6126 | se = tg->se[i]; |
6092 | /* Propagate contribution to hierarchy */ | 6127 | /* Propagate contribution to hierarchy */ |
6093 | raw_spin_lock_irqsave(&rq->lock, flags); | 6128 | raw_spin_lock_irqsave(&rq->lock, flags); |
6129 | |||
6130 | /* Possible calls to update_curr() need rq clock */ | ||
6131 | update_rq_clock(rq); | ||
6094 | for_each_sched_entity(se) | 6132 | for_each_sched_entity(se) |
6095 | update_cfs_shares(group_cfs_rq(se)); | 6133 | update_cfs_shares(group_cfs_rq(se)); |
6096 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6134 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = { | |||
6146 | 6184 | ||
6147 | #ifdef CONFIG_SMP | 6185 | #ifdef CONFIG_SMP |
6148 | .select_task_rq = select_task_rq_fair, | 6186 | .select_task_rq = select_task_rq_fair, |
6149 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6150 | .migrate_task_rq = migrate_task_rq_fair, | 6187 | .migrate_task_rq = migrate_task_rq_fair, |
6151 | #endif | 6188 | |
6152 | .rq_online = rq_online_fair, | 6189 | .rq_online = rq_online_fair, |
6153 | .rq_offline = rq_offline_fair, | 6190 | .rq_offline = rq_offline_fair, |
6154 | 6191 | ||
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 000000000000..16f5a30f9c88 --- /dev/null +++ b/kernel/sched/proc.c | |||
@@ -0,0 +1,591 @@ | |||
1 | /* | ||
2 | * kernel/sched/proc.c | ||
3 | * | ||
4 | * Kernel load calculations, forked from sched/core.c | ||
5 | */ | ||
6 | |||
7 | #include <linux/export.h> | ||
8 | |||
9 | #include "sched.h" | ||
10 | |||
11 | unsigned long this_cpu_load(void) | ||
12 | { | ||
13 | struct rq *this = this_rq(); | ||
14 | return this->cpu_load[0]; | ||
15 | } | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Global load-average calculations | ||
20 | * | ||
21 | * We take a distributed and async approach to calculating the global load-avg | ||
22 | * in order to minimize overhead. | ||
23 | * | ||
24 | * The global load average is an exponentially decaying average of nr_running + | ||
25 | * nr_uninterruptible. | ||
26 | * | ||
27 | * Once every LOAD_FREQ: | ||
28 | * | ||
29 | * nr_active = 0; | ||
30 | * for_each_possible_cpu(cpu) | ||
31 | * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible; | ||
32 | * | ||
33 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
34 | * | ||
35 | * Due to a number of reasons the above turns into the mess below: | ||
36 | * | ||
37 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
38 | * serious number of cpus, therefore we need to take a distributed approach | ||
39 | * to calculating nr_active. | ||
40 | * | ||
41 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
42 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
43 | * | ||
44 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
45 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
46 | * to obtain the same result. See calc_load_fold_active(). | ||
47 | * | ||
48 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
49 | * across the machine, we assume 10 ticks is sufficient time for every | ||
50 | * cpu to have completed this task. | ||
51 | * | ||
52 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
53 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
54 | * | ||
55 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
56 | * this would add another cross-cpu cacheline miss and atomic operation | ||
57 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
58 | * when it went into uninterruptible state and decrement on whatever cpu | ||
59 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
60 | * all cpus yields the correct result. | ||
61 | * | ||
62 | * This covers the NO_HZ=n code; for extra headaches, see the comment below. | ||
63 | */ | ||
64 | |||
65 | /* Variables and functions for calc_load */ | ||
66 | atomic_long_t calc_load_tasks; | ||
67 | unsigned long calc_load_update; | ||
68 | unsigned long avenrun[3]; | ||
69 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
70 | |||
71 | /** | ||
72 | * get_avenrun - get the load average array | ||
73 | * @loads: pointer to dest load array | ||
74 | * @offset: offset to add | ||
75 | * @shift: shift count to shift the result left | ||
76 | * | ||
77 | * These values are estimates at best, so no need for locking. | ||
78 | */ | ||
79 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
80 | { | ||
81 | loads[0] = (avenrun[0] + offset) << shift; | ||
82 | loads[1] = (avenrun[1] + offset) << shift; | ||
83 | loads[2] = (avenrun[2] + offset) << shift; | ||
84 | } | ||
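For context, consumers such as /proc/loadavg turn these fixed-point values into the familiar "0.00" form by splitting off an integer part and a two-digit fraction. A stand-alone sketch of that conversion, assuming the usual FSHIFT = 11 precision from the load-average headers:

    #include <stdio.h>

    #define FSHIFT   11                    /* assumed fixed-point precision */
    #define FIXED_1  (1UL << FSHIFT)       /* 1.0 in fixed point            */

    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    static void print_loadavg(const unsigned long loads[3])
    {
            printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
                   LOAD_INT(loads[0]), LOAD_FRAC(loads[0]),
                   LOAD_INT(loads[1]), LOAD_FRAC(loads[1]),
                   LOAD_INT(loads[2]), LOAD_FRAC(loads[2]));
    }

    int main(void)
    {
            unsigned long loads[3] = { 2376, 1024, 512 };   /* 1.16 0.50 0.25 */

            print_loadavg(loads);
            return 0;
    }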
85 | |||
86 | long calc_load_fold_active(struct rq *this_rq) | ||
87 | { | ||
88 | long nr_active, delta = 0; | ||
89 | |||
90 | nr_active = this_rq->nr_running; | ||
91 | nr_active += (long) this_rq->nr_uninterruptible; | ||
92 | |||
93 | if (nr_active != this_rq->calc_load_active) { | ||
94 | delta = nr_active - this_rq->calc_load_active; | ||
95 | this_rq->calc_load_active = nr_active; | ||
96 | } | ||
97 | |||
98 | return delta; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * a1 = a0 * e + a * (1 - e) | ||
103 | */ | ||
104 | static unsigned long | ||
105 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
106 | { | ||
107 | load *= exp; | ||
108 | load += active * (FIXED_1 - exp); | ||
109 | load += 1UL << (FSHIFT - 1); | ||
110 | return load >> FSHIFT; | ||
111 | } | ||
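A quick numeric check of the step above: with the usual constants (FIXED_1 = 2048 and EXP_1 = 1884 for the 1-minute series, both assumed here), an old average of 1.00 and three currently active tasks yield (2048*1884 + 3*2048*164 + 1024) >> 11 = 2376, i.e. 1.16, so each LOAD_FREQ interval nudges the average a small step toward the instantaneous task count. The same arithmetic in stand-alone form:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884            /* assumed: 1/exp(5sec/1min) in fixed point */

    /* Same EWMA step as calc_load() in the file above. */
    static unsigned long ewma_step(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);            /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avg = FIXED_1;            /* 1.00 */

            avg = ewma_step(avg, EXP_1, 3 * FIXED_1);
            printf("%lu.%02lu\n", avg >> FSHIFT,
                   ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);    /* prints 1.16 */
            return 0;
    }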
112 | |||
113 | #ifdef CONFIG_NO_HZ_COMMON | ||
114 | /* | ||
115 | * Handle NO_HZ for the global load-average. | ||
116 | * | ||
117 | * Since the above described distributed algorithm to compute the global | ||
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
119 | * NO_HZ. | ||
120 | * | ||
121 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
123 | * when we read the global state. | ||
124 | * | ||
125 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
126 | * | ||
127 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
128 | * contribution, causing under-accounting. | ||
129 | * | ||
130 | * We avoid this by keeping two idle-delta counters and flipping them | ||
131 | * when the window starts, thus separating old and new NO_HZ load. | ||
132 | * | ||
133 | * The only trick is the slight shift in index flip for read vs write. | ||
134 | * | ||
135 | * 0s 5s 10s 15s | ||
136 | * +10 +10 +10 +10 | ||
137 | * |-|-----------|-|-----------|-|-----------|-| | ||
138 | * r:0 0 1 1 0 0 1 1 0 | ||
139 | * w:0 1 1 0 0 1 1 0 0 | ||
140 | * | ||
141 | * This ensures we'll fold the old idle contribution in this window while | ||
142 | * accumulating the new one. | ||
143 | * | ||
144 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
145 | * contribution, since we effectively move our sample point to a known | ||
146 | * busy state. | ||
147 | * | ||
148 | * This is solved by pushing the window forward, and thus skipping the | ||
149 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
150 | * was in effect at the time the window opened). This also solves the issue | ||
151 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
152 | * LOAD_FREQ intervals. | ||
153 | * | ||
154 | * When making the ILB scale, we should try to pull this in as well. | ||
155 | */ | ||
156 | static atomic_long_t calc_load_idle[2]; | ||
157 | static int calc_load_idx; | ||
158 | |||
159 | static inline int calc_load_write_idx(void) | ||
160 | { | ||
161 | int idx = calc_load_idx; | ||
162 | |||
163 | /* | ||
164 | * See calc_global_nohz(), if we observe the new index, we also | ||
165 | * need to observe the new update time. | ||
166 | */ | ||
167 | smp_rmb(); | ||
168 | |||
169 | /* | ||
170 | * If the folding window started, make sure we start writing in the | ||
171 | * next idle-delta. | ||
172 | */ | ||
173 | if (!time_before(jiffies, calc_load_update)) | ||
174 | idx++; | ||
175 | |||
176 | return idx & 1; | ||
177 | } | ||
178 | |||
179 | static inline int calc_load_read_idx(void) | ||
180 | { | ||
181 | return calc_load_idx & 1; | ||
182 | } | ||
183 | |||
184 | void calc_load_enter_idle(void) | ||
185 | { | ||
186 | struct rq *this_rq = this_rq(); | ||
187 | long delta; | ||
188 | |||
189 | /* | ||
190 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
191 | * into the pending idle delta. | ||
192 | */ | ||
193 | delta = calc_load_fold_active(this_rq); | ||
194 | if (delta) { | ||
195 | int idx = calc_load_write_idx(); | ||
196 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | void calc_load_exit_idle(void) | ||
201 | { | ||
202 | struct rq *this_rq = this_rq(); | ||
203 | |||
204 | /* | ||
205 | * If we're still before the sample window, we're done. | ||
206 | */ | ||
207 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
208 | return; | ||
209 | |||
210 | /* | ||
211 | * We woke inside or after the sample window, which means we're already | ||
212 | * accounted through the nohz accounting, so skip the entire deal and | ||
213 | * sync up for the next window. | ||
214 | */ | ||
215 | this_rq->calc_load_update = calc_load_update; | ||
216 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
217 | this_rq->calc_load_update += LOAD_FREQ; | ||
218 | } | ||
219 | |||
220 | static long calc_load_fold_idle(void) | ||
221 | { | ||
222 | int idx = calc_load_read_idx(); | ||
223 | long delta = 0; | ||
224 | |||
225 | if (atomic_long_read(&calc_load_idle[idx])) | ||
226 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
227 | |||
228 | return delta; | ||
229 | } | ||
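Taken together, calc_load_enter_idle(), calc_load_exit_idle() and calc_load_fold_idle() implement the double-buffered idle delta the big comment above describes: idle cpus write into the slot their delta belongs to, and the global reader drains the slot of the window being closed before the index flips. A condensed user-space sketch of just the two-slot indexing (jiffies handling, memory barriers and the per-rq state are deliberately left out):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_long idle_delta[2];
    static int load_idx;            /* advanced once per LOAD_FREQ window */

    /* Writer: a cpu going idle folds its delta into the current slot,
     * or the next one if the folding window has already opened. */
    static void fold_idle_delta(long delta, bool window_started)
    {
            int idx = (load_idx + (window_started ? 1 : 0)) & 1;

            atomic_fetch_add(&idle_delta[idx], delta);
    }

    /* Reader: the global update drains the closing window's slot and
     * then flips the index so new idle deltas land in the other slot. */
    static long drain_idle_delta(void)
    {
            long delta = atomic_exchange(&idle_delta[load_idx & 1], 0);

            load_idx++;
            return delta;
    }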
230 | |||
231 | /** | ||
232 | * fixed_power_int - compute: x^n, in O(log n) time | ||
233 | * | ||
234 | * @x: base of the power | ||
235 | * @frac_bits: fractional bits of @x | ||
236 | * @n: power to raise @x to. | ||
237 | * | ||
238 | * By exploiting the relation between the definition of the natural power | ||
239 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
240 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
241 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
242 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
243 | * of course trivially computable in O(log_2 n), the length of our binary | ||
244 | * vector. | ||
245 | */ | ||
246 | static unsigned long | ||
247 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
248 | { | ||
249 | unsigned long result = 1UL << frac_bits; | ||
250 | |||
251 | if (n) for (;;) { | ||
252 | if (n & 1) { | ||
253 | result *= x; | ||
254 | result += 1UL << (frac_bits - 1); | ||
255 | result >>= frac_bits; | ||
256 | } | ||
257 | n >>= 1; | ||
258 | if (!n) | ||
259 | break; | ||
260 | x *= x; | ||
261 | x += 1UL << (frac_bits - 1); | ||
262 | x >>= frac_bits; | ||
263 | } | ||
264 | |||
265 | return result; | ||
266 | } | ||
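fixed_power_int() is ordinary exponentiation by squaring carried out in fixed point, rounding after every multiply. It can be sanity-checked stand-alone: with 11 fractional bits and an assumed EXP_1 = 1884 (about 0.92), raising it to the 12th power, i.e. one minute's worth of missed 5-second windows, lands around 750/2048, roughly 0.37, which is what the catch-up path feeds into the decay below:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884                    /* assumed, as elsewhere in this file */

    /* Same O(log n) fixed-point power as fixed_power_int() above. */
    static unsigned long fixed_pow(unsigned long x, unsigned int frac_bits,
                                   unsigned int n)
    {
            unsigned long result = 1UL << frac_bits;

            while (n) {
                    if (n & 1) {
                            result *= x;
                            result += 1UL << (frac_bits - 1);
                            result >>= frac_bits;
                    }
                    n >>= 1;
                    if (!n)
                            break;
                    x *= x;
                    x += 1UL << (frac_bits - 1);
                    x >>= frac_bits;
            }
            return result;
    }

    int main(void)
    {
            printf("%lu\n", fixed_pow(EXP_1, FSHIFT, 12));  /* ~750, i.e. ~0.37 */
            return 0;
    }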
267 | |||
268 | /* | ||
269 | * a1 = a0 * e + a * (1 - e) | ||
270 | * | ||
271 | * a2 = a1 * e + a * (1 - e) | ||
272 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
273 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
274 | * | ||
275 | * a3 = a2 * e + a * (1 - e) | ||
276 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
277 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
278 | * | ||
279 | * ... | ||
280 | * | ||
281 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
282 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
283 | * = a0 * e^n + a * (1 - e^n) | ||
284 | * | ||
285 | * [1] application of the geometric series: | ||
286 | * | ||
287 | * n 1 - x^(n+1) | ||
288 | * S_n := \Sum x^i = ------------- | ||
289 | * i=0 1 - x | ||
290 | */ | ||
291 | static unsigned long | ||
292 | calc_load_n(unsigned long load, unsigned long exp, | ||
293 | unsigned long active, unsigned int n) | ||
294 | { | ||
295 | |||
296 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
301 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
302 | * calc_load_idle per calc_load_enter_idle(), all we need to do is fold | ||
303 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
304 | * | ||
305 | * Once we've updated the global active value, we need to apply the exponential | ||
306 | * weights adjusted to the number of cycles missed. | ||
307 | */ | ||
308 | static void calc_global_nohz(void) | ||
309 | { | ||
310 | long delta, active, n; | ||
311 | |||
312 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
313 | /* | ||
314 | * Catch-up, fold however many we are behind still | ||
315 | */ | ||
316 | delta = jiffies - calc_load_update - 10; | ||
317 | n = 1 + (delta / LOAD_FREQ); | ||
318 | |||
319 | active = atomic_long_read(&calc_load_tasks); | ||
320 | active = active > 0 ? active * FIXED_1 : 0; | ||
321 | |||
322 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
323 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
324 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
325 | |||
326 | calc_load_update += n * LOAD_FREQ; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Flip the idle index... | ||
331 | * | ||
332 | * Make sure we first write the new time then flip the index, so that | ||
333 | * calc_load_write_idx() will see the new time when it reads the new | ||
334 | * index, this avoids a double flip messing things up. | ||
335 | */ | ||
336 | smp_wmb(); | ||
337 | calc_load_idx++; | ||
338 | } | ||
339 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
340 | |||
341 | static inline long calc_load_fold_idle(void) { return 0; } | ||
342 | static inline void calc_global_nohz(void) { } | ||
343 | |||
344 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
345 | |||
346 | /* | ||
347 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
348 | * CPUs have updated calc_load_tasks. | ||
349 | */ | ||
350 | void calc_global_load(unsigned long ticks) | ||
351 | { | ||
352 | long active, delta; | ||
353 | |||
354 | if (time_before(jiffies, calc_load_update + 10)) | ||
355 | return; | ||
356 | |||
357 | /* | ||
358 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
359 | */ | ||
360 | delta = calc_load_fold_idle(); | ||
361 | if (delta) | ||
362 | atomic_long_add(delta, &calc_load_tasks); | ||
363 | |||
364 | active = atomic_long_read(&calc_load_tasks); | ||
365 | active = active > 0 ? active * FIXED_1 : 0; | ||
366 | |||
367 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
368 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
369 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
370 | |||
371 | calc_load_update += LOAD_FREQ; | ||
372 | |||
373 | /* | ||
374 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
375 | */ | ||
376 | calc_global_nohz(); | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Called from update_cpu_load() to periodically update this CPU's | ||
381 | * active count. | ||
382 | */ | ||
383 | static void calc_load_account_active(struct rq *this_rq) | ||
384 | { | ||
385 | long delta; | ||
386 | |||
387 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
388 | return; | ||
389 | |||
390 | delta = calc_load_fold_active(this_rq); | ||
391 | if (delta) | ||
392 | atomic_long_add(delta, &calc_load_tasks); | ||
393 | |||
394 | this_rq->calc_load_update += LOAD_FREQ; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * End of global load-average stuff | ||
399 | */ | ||
400 | |||
401 | /* | ||
402 | * The exact cpuload at various idx values, calculated at every tick would be | ||
403 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
404 | * | ||
405 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
406 | * on nth tick when cpu may be busy, then we have: | ||
407 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
408 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
409 | * | ||
410 | * decay_load_missed() below does efficient calculation of | ||
411 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
412 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
413 | * | ||
414 | * The calculation is approximated on a 128 point scale. | ||
415 | * degrade_zero_ticks is the number of ticks after which load at any | ||
416 | * particular idx is approximated to be zero. | ||
417 | * degrade_factor is a precomputed table, a row for each load idx. | ||
418 | * Each column corresponds to degradation factor for a power of two ticks, | ||
419 | * based on 128 point scale. | ||
420 | * Example: | ||
421 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
422 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
423 | * | ||
424 | * With this power of 2 load factors, we can degrade the load n times | ||
425 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
426 | * n mult/shifts needed by the exact degradation. | ||
427 | */ | ||
428 | #define DEGRADE_SHIFT 7 | ||
429 | static const unsigned char | ||
430 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
431 | static const unsigned char | ||
432 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
433 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
434 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
435 | {96, 72, 40, 12, 1, 0, 0}, | ||
436 | {112, 98, 75, 43, 15, 1, 0}, | ||
437 | {120, 112, 98, 76, 45, 16, 2} }; | ||
438 | |||
439 | /* | ||
440 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
441 | * would be when CPU is idle and so we just decay the old load without | ||
442 | * adding any new load. | ||
443 | */ | ||
444 | static unsigned long | ||
445 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
446 | { | ||
447 | int j = 0; | ||
448 | |||
449 | if (!missed_updates) | ||
450 | return load; | ||
451 | |||
452 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
453 | return 0; | ||
454 | |||
455 | if (idx == 1) | ||
456 | return load >> missed_updates; | ||
457 | |||
458 | while (missed_updates) { | ||
459 | if (missed_updates % 2) | ||
460 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
461 | |||
462 | missed_updates >>= 1; | ||
463 | j++; | ||
464 | } | ||
465 | return load; | ||
466 | } | ||
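decay_load_missed() applies the factor table by walking the set bits of missed_updates, so decaying a load over, say, 13 idle ticks costs three table multiplies (the 1, 4 and 8 tick entries) rather than 13. A stand-alone sketch of the same walk, reusing the row for load idx 2 from the table above (treated here as assumed constants):

    #include <stdio.h>

    #define DEGRADE_SHIFT 7

    /* Row for load idx 2 from the table above; column j covers 2^j ticks. */
    static const unsigned char degrade_idx2[DEGRADE_SHIFT + 1] =
            { 96, 72, 40, 12, 1, 0, 0, 0 };

    static unsigned long decay_missed(unsigned long load, unsigned long missed)
    {
            int j = 0;

            while (missed) {
                    if (missed & 1)
                            load = (load * degrade_idx2[j]) >> DEGRADE_SHIFT;
                    missed >>= 1;
                    j++;
            }
            return load;
    }

    int main(void)
    {
            /* 13 = 1 + 4 + 8 missed ticks: three multiplies instead of 13. */
            printf("%lu\n", decay_missed(1024, 13));
            return 0;
    }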
467 | |||
468 | /* | ||
469 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
470 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
471 | * every tick. We fix it up based on jiffies. | ||
472 | */ | ||
473 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
474 | unsigned long pending_updates) | ||
475 | { | ||
476 | int i, scale; | ||
477 | |||
478 | this_rq->nr_load_updates++; | ||
479 | |||
480 | /* Update our load: */ | ||
481 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
482 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
483 | unsigned long old_load, new_load; | ||
484 | |||
485 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
486 | |||
487 | old_load = this_rq->cpu_load[i]; | ||
488 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
489 | new_load = this_load; | ||
490 | /* | ||
491 | * Round up the averaging division if load is increasing. This | ||
492 | * prevents us from getting stuck on 9 if the load is 10, for | ||
493 | * example. | ||
494 | */ | ||
495 | if (new_load > old_load) | ||
496 | new_load += scale - 1; | ||
497 | |||
498 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
499 | } | ||
500 | |||
501 | sched_avg_update(this_rq); | ||
502 | } | ||
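The per-index update above is itself a decaying average whose memory grows with i: cpu_load[i] keeps (scale - 1)/scale of the old value and folds in 1/scale of the new one, and the 'scale - 1' round-up is what lets a rising load actually reach its target. A worked instance for i = 3 (scale = 8): with old_load = 9 and this_load = 10, plain truncation would give (9*7 + 10) >> 3 = 9 forever, while rounding up the new term gives (9*7 + 10 + 7) >> 3 = 10, so the average converges. The same step in isolation:

    #include <stdio.h>

    static unsigned long cpu_load_step(unsigned long old_load,
                                       unsigned long new_load, int i)
    {
            unsigned long scale = 1UL << i;

            /* Round up while the load is rising, as in __update_cpu_load(). */
            if (new_load > old_load)
                    new_load += scale - 1;

            return (old_load * (scale - 1) + new_load) >> i;
    }

    int main(void)
    {
            printf("%lu\n", cpu_load_step(9, 10, 3));       /* prints 10 */
            return 0;
    }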
503 | |||
504 | #ifdef CONFIG_SMP | ||
505 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
506 | { | ||
507 | return rq->cfs.runnable_load_avg; | ||
508 | } | ||
509 | #else | ||
510 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
511 | { | ||
512 | return rq->load.weight; | ||
513 | } | ||
514 | #endif | ||
515 | |||
516 | #ifdef CONFIG_NO_HZ_COMMON | ||
517 | /* | ||
518 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
519 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
520 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
521 | * | ||
522 | * Therefore we cannot use the delta approach from the regular tick since that | ||
523 | * would seriously skew the load calculation. However we'll make do for those | ||
524 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
525 | * (tick_nohz_idle_exit). | ||
526 | * | ||
527 | * This means we might still be one tick off for nohz periods. | ||
528 | */ | ||
529 | |||
530 | /* | ||
531 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
532 | * idle balance. | ||
533 | */ | ||
534 | void update_idle_cpu_load(struct rq *this_rq) | ||
535 | { | ||
536 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
537 | unsigned long load = get_rq_runnable_load(this_rq); | ||
538 | unsigned long pending_updates; | ||
539 | |||
540 | /* | ||
541 | * bail if there's load or we're actually up-to-date. | ||
542 | */ | ||
543 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
544 | return; | ||
545 | |||
546 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
547 | this_rq->last_load_update_tick = curr_jiffies; | ||
548 | |||
549 | __update_cpu_load(this_rq, load, pending_updates); | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
554 | */ | ||
555 | void update_cpu_load_nohz(void) | ||
556 | { | ||
557 | struct rq *this_rq = this_rq(); | ||
558 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
559 | unsigned long pending_updates; | ||
560 | |||
561 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
562 | return; | ||
563 | |||
564 | raw_spin_lock(&this_rq->lock); | ||
565 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
566 | if (pending_updates) { | ||
567 | this_rq->last_load_update_tick = curr_jiffies; | ||
568 | /* | ||
569 | * We were idle, this means load 0, the current load might be | ||
570 | * !0 due to remote wakeups and the sort. | ||
571 | */ | ||
572 | __update_cpu_load(this_rq, 0, pending_updates); | ||
573 | } | ||
574 | raw_spin_unlock(&this_rq->lock); | ||
575 | } | ||
576 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
577 | |||
578 | /* | ||
579 | * Called from scheduler_tick() | ||
580 | */ | ||
581 | void update_cpu_load_active(struct rq *this_rq) | ||
582 | { | ||
583 | unsigned long load = get_rq_runnable_load(this_rq); | ||
584 | /* | ||
585 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
586 | */ | ||
587 | this_rq->last_load_update_tick = jiffies; | ||
588 | __update_cpu_load(this_rq, load, 1); | ||
589 | |||
590 | calc_load_account_active(this_rq); | ||
591 | } | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4ab..01970c8e64df 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) | |||
399 | (iter = next_task_group(iter)) && \ | 399 | (iter = next_task_group(iter)) && \ |
400 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) | 400 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) |
401 | 401 | ||
402 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
403 | { | ||
404 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
405 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
406 | } | ||
407 | |||
408 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
409 | { | ||
410 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
411 | } | ||
412 | |||
413 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
414 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
415 | |||
416 | #define for_each_sched_rt_entity(rt_se) \ | 402 | #define for_each_sched_rt_entity(rt_se) \ |
417 | for (; rt_se; rt_se = rt_se->parent) | 403 | for (; rt_se; rt_se = rt_se->parent) |
418 | 404 | ||
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
472 | #ifdef CONFIG_SMP | 458 | #ifdef CONFIG_SMP |
473 | static inline const struct cpumask *sched_rt_period_mask(void) | 459 | static inline const struct cpumask *sched_rt_period_mask(void) |
474 | { | 460 | { |
475 | return cpu_rq(smp_processor_id())->rd->span; | 461 | return this_rq()->rd->span; |
476 | } | 462 | } |
477 | #else | 463 | #else |
478 | static inline const struct cpumask *sched_rt_period_mask(void) | 464 | static inline const struct cpumask *sched_rt_period_mask(void) |
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t; | |||
509 | #define for_each_rt_rq(rt_rq, iter, rq) \ | 495 | #define for_each_rt_rq(rt_rq, iter, rq) \ |
510 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 496 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
511 | 497 | ||
512 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
513 | { | ||
514 | } | ||
515 | |||
516 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
517 | { | ||
518 | } | ||
519 | |||
520 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
521 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
522 | |||
523 | #define for_each_sched_rt_entity(rt_se) \ | 498 | #define for_each_sched_rt_entity(rt_se) \ |
524 | for (; rt_se; rt_se = NULL) | 499 | for (; rt_se; rt_se = NULL) |
525 | 500 | ||
@@ -699,15 +674,6 @@ balanced: | |||
699 | } | 674 | } |
700 | } | 675 | } |
701 | 676 | ||
702 | static void disable_runtime(struct rq *rq) | ||
703 | { | ||
704 | unsigned long flags; | ||
705 | |||
706 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
707 | __disable_runtime(rq); | ||
708 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
709 | } | ||
710 | |||
711 | static void __enable_runtime(struct rq *rq) | 677 | static void __enable_runtime(struct rq *rq) |
712 | { | 678 | { |
713 | rt_rq_iter_t iter; | 679 | rt_rq_iter_t iter; |
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq) | |||
732 | } | 698 | } |
733 | } | 699 | } |
734 | 700 | ||
735 | static void enable_runtime(struct rq *rq) | ||
736 | { | ||
737 | unsigned long flags; | ||
738 | |||
739 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
740 | __enable_runtime(rq); | ||
741 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
742 | } | ||
743 | |||
744 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
745 | { | ||
746 | int cpu = (int)(long)hcpu; | ||
747 | |||
748 | switch (action) { | ||
749 | case CPU_DOWN_PREPARE: | ||
750 | case CPU_DOWN_PREPARE_FROZEN: | ||
751 | disable_runtime(cpu_rq(cpu)); | ||
752 | return NOTIFY_OK; | ||
753 | |||
754 | case CPU_DOWN_FAILED: | ||
755 | case CPU_DOWN_FAILED_FROZEN: | ||
756 | case CPU_ONLINE: | ||
757 | case CPU_ONLINE_FROZEN: | ||
758 | enable_runtime(cpu_rq(cpu)); | ||
759 | return NOTIFY_OK; | ||
760 | |||
761 | default: | ||
762 | return NOTIFY_DONE; | ||
763 | } | ||
764 | } | ||
765 | |||
766 | static int balance_runtime(struct rt_rq *rt_rq) | 701 | static int balance_runtime(struct rt_rq *rt_rq) |
767 | { | 702 | { |
768 | int more = 0; | 703 | int more = 0; |
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq) | |||
926 | if (curr->sched_class != &rt_sched_class) | 861 | if (curr->sched_class != &rt_sched_class) |
927 | return; | 862 | return; |
928 | 863 | ||
929 | delta_exec = rq->clock_task - curr->se.exec_start; | 864 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
930 | if (unlikely((s64)delta_exec <= 0)) | 865 | if (unlikely((s64)delta_exec <= 0)) |
931 | return; | 866 | return; |
932 | 867 | ||
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq) | |||
936 | curr->se.sum_exec_runtime += delta_exec; | 871 | curr->se.sum_exec_runtime += delta_exec; |
937 | account_group_exec_runtime(curr, delta_exec); | 872 | account_group_exec_runtime(curr, delta_exec); |
938 | 873 | ||
939 | curr->se.exec_start = rq->clock_task; | 874 | curr->se.exec_start = rq_clock_task(rq); |
940 | cpuacct_charge(curr, delta_exec); | 875 | cpuacct_charge(curr, delta_exec); |
941 | 876 | ||
942 | sched_rt_avg_update(rq, delta_exec); | 877 | sched_rt_avg_update(rq, delta_exec); |
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
1106 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 1041 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
1107 | return; | 1042 | return; |
1108 | 1043 | ||
1109 | if (!rt_rq->rt_nr_running) | ||
1110 | list_add_leaf_rt_rq(rt_rq); | ||
1111 | |||
1112 | if (head) | 1044 | if (head) |
1113 | list_add(&rt_se->run_list, queue); | 1045 | list_add(&rt_se->run_list, queue); |
1114 | else | 1046 | else |
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
1128 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 1060 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
1129 | 1061 | ||
1130 | dec_rt_tasks(rt_se, rt_rq); | 1062 | dec_rt_tasks(rt_se, rt_rq); |
1131 | if (!rt_rq->rt_nr_running) | ||
1132 | list_del_leaf_rt_rq(rt_rq); | ||
1133 | } | 1063 | } |
1134 | 1064 | ||
1135 | /* | 1065 | /* |
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1385 | } while (rt_rq); | 1315 | } while (rt_rq); |
1386 | 1316 | ||
1387 | p = rt_task_of(rt_se); | 1317 | p = rt_task_of(rt_se); |
1388 | p->se.exec_start = rq->clock_task; | 1318 | p->se.exec_start = rq_clock_task(rq); |
1389 | 1319 | ||
1390 | return p; | 1320 | return p; |
1391 | } | 1321 | } |
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1434 | return 0; | 1364 | return 0; |
1435 | } | 1365 | } |
1436 | 1366 | ||
1437 | /* Return the second highest RT task, NULL otherwise */ | 1367 | /* |
1438 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | 1368 | * Return the highest pushable rq's task, which is suitable to be executed |
1369 | * on the cpu, NULL otherwise | ||
1370 | */ | ||
1371 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | ||
1439 | { | 1372 | { |
1440 | struct task_struct *next = NULL; | 1373 | struct plist_head *head = &rq->rt.pushable_tasks; |
1441 | struct sched_rt_entity *rt_se; | 1374 | struct task_struct *p; |
1442 | struct rt_prio_array *array; | ||
1443 | struct rt_rq *rt_rq; | ||
1444 | int idx; | ||
1445 | |||
1446 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
1447 | array = &rt_rq->active; | ||
1448 | idx = sched_find_first_bit(array->bitmap); | ||
1449 | next_idx: | ||
1450 | if (idx >= MAX_RT_PRIO) | ||
1451 | continue; | ||
1452 | if (next && next->prio <= idx) | ||
1453 | continue; | ||
1454 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
1455 | struct task_struct *p; | ||
1456 | 1375 | ||
1457 | if (!rt_entity_is_task(rt_se)) | 1376 | if (!has_pushable_tasks(rq)) |
1458 | continue; | 1377 | return NULL; |
1459 | 1378 | ||
1460 | p = rt_task_of(rt_se); | 1379 | plist_for_each_entry(p, head, pushable_tasks) { |
1461 | if (pick_rt_task(rq, p, cpu)) { | 1380 | if (pick_rt_task(rq, p, cpu)) |
1462 | next = p; | 1381 | return p; |
1463 | break; | ||
1464 | } | ||
1465 | } | ||
1466 | if (!next) { | ||
1467 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
1468 | goto next_idx; | ||
1469 | } | ||
1470 | } | 1382 | } |
1471 | 1383 | ||
1472 | return next; | 1384 | return NULL; |
1473 | } | 1385 | } |
1474 | 1386 | ||
1475 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); | 1387 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); |
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq) | |||
1743 | double_lock_balance(this_rq, src_rq); | 1655 | double_lock_balance(this_rq, src_rq); |
1744 | 1656 | ||
1745 | /* | 1657 | /* |
1746 | * Are there still pullable RT tasks? | 1658 | * We can pull only a task, which is pushable |
1659 | * on its rq, and no others. | ||
1747 | */ | 1660 | */ |
1748 | if (src_rq->rt.rt_nr_running <= 1) | 1661 | p = pick_highest_pushable_task(src_rq, this_cpu); |
1749 | goto skip; | ||
1750 | |||
1751 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
1752 | 1662 | ||
1753 | /* | 1663 | /* |
1754 | * Do we have an RT task that preempts | 1664 | * Do we have an RT task that preempts |
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
2037 | { | 1947 | { |
2038 | struct task_struct *p = rq->curr; | 1948 | struct task_struct *p = rq->curr; |
2039 | 1949 | ||
2040 | p->se.exec_start = rq->clock_task; | 1950 | p->se.exec_start = rq_clock_task(rq); |
2041 | 1951 | ||
2042 | /* The running task is never eligible for pushing */ | 1952 | /* The running task is never eligible for pushing */ |
2043 | dequeue_pushable_task(rq, p); | 1953 | dequeue_pushable_task(rq, p); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d6155..ef0a7b2439dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -10,8 +10,16 @@ | |||
10 | #include "cpupri.h" | 10 | #include "cpupri.h" |
11 | #include "cpuacct.h" | 11 | #include "cpuacct.h" |
12 | 12 | ||
13 | struct rq; | ||
14 | |||
13 | extern __read_mostly int scheduler_running; | 15 | extern __read_mostly int scheduler_running; |
14 | 16 | ||
17 | extern unsigned long calc_load_update; | ||
18 | extern atomic_long_t calc_load_tasks; | ||
19 | |||
20 | extern long calc_load_fold_active(struct rq *this_rq); | ||
21 | extern void update_cpu_load_active(struct rq *this_rq); | ||
22 | |||
15 | /* | 23 | /* |
16 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 24 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
17 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 25 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -140,10 +148,11 @@ struct task_group { | |||
140 | struct cfs_rq **cfs_rq; | 148 | struct cfs_rq **cfs_rq; |
141 | unsigned long shares; | 149 | unsigned long shares; |
142 | 150 | ||
143 | atomic_t load_weight; | 151 | #ifdef CONFIG_SMP |
144 | atomic64_t load_avg; | 152 | atomic_long_t load_avg; |
145 | atomic_t runnable_avg; | 153 | atomic_t runnable_avg; |
146 | #endif | 154 | #endif |
155 | #endif | ||
147 | 156 | ||
148 | #ifdef CONFIG_RT_GROUP_SCHED | 157 | #ifdef CONFIG_RT_GROUP_SCHED |
149 | struct sched_rt_entity **rt_se; | 158 | struct sched_rt_entity **rt_se; |
@@ -261,26 +270,21 @@ struct cfs_rq { | |||
261 | #endif | 270 | #endif |
262 | 271 | ||
263 | #ifdef CONFIG_SMP | 272 | #ifdef CONFIG_SMP |
264 | /* | ||
265 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
266 | * removed when useful for applications beyond shares distribution (e.g. | ||
267 | * load-balance). | ||
268 | */ | ||
269 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
270 | /* | 273 | /* |
271 | * CFS Load tracking | 274 | * CFS Load tracking |
272 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 275 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
273 | * This allows for the description of both thread and group usage (in | 276 | * This allows for the description of both thread and group usage (in |
274 | * the FAIR_GROUP_SCHED case). | 277 | * the FAIR_GROUP_SCHED case). |
275 | */ | 278 | */ |
276 | u64 runnable_load_avg, blocked_load_avg; | 279 | unsigned long runnable_load_avg, blocked_load_avg; |
277 | atomic64_t decay_counter, removed_load; | 280 | atomic64_t decay_counter; |
278 | u64 last_decay; | 281 | u64 last_decay; |
279 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 282 | atomic_long_t removed_load; |
280 | /* These always depend on CONFIG_FAIR_GROUP_SCHED */ | 283 | |
281 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
285 | /* Required to track per-cpu representation of a task_group */ | ||
282 | u32 tg_runnable_contrib; | 286 | u32 tg_runnable_contrib; |
283 | u64 tg_load_contrib; | 287 | unsigned long tg_load_contrib; |
284 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
285 | 289 | ||
286 | /* | 290 | /* |
@@ -353,7 +357,6 @@ struct rt_rq { | |||
353 | unsigned long rt_nr_boosted; | 357 | unsigned long rt_nr_boosted; |
354 | 358 | ||
355 | struct rq *rq; | 359 | struct rq *rq; |
356 | struct list_head leaf_rt_rq_list; | ||
357 | struct task_group *tg; | 360 | struct task_group *tg; |
358 | #endif | 361 | #endif |
359 | }; | 362 | }; |
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues); | |||
540 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 543 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
541 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | 544 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) |
542 | 545 | ||
546 | static inline u64 rq_clock(struct rq *rq) | ||
547 | { | ||
548 | return rq->clock; | ||
549 | } | ||
550 | |||
551 | static inline u64 rq_clock_task(struct rq *rq) | ||
552 | { | ||
553 | return rq->clock_task; | ||
554 | } | ||
555 | |||
543 | #ifdef CONFIG_SMP | 556 | #ifdef CONFIG_SMP |
544 | 557 | ||
545 | #define rcu_dereference_check_sched_domain(p) \ | 558 | #define rcu_dereference_check_sched_domain(p) \ |
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
884 | #define WF_FORK 0x02 /* child wakeup after fork */ | 897 | #define WF_FORK 0x02 /* child wakeup after fork */ |
885 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 898 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ |
886 | 899 | ||
887 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
888 | { | ||
889 | lw->weight += inc; | ||
890 | lw->inv_weight = 0; | ||
891 | } | ||
892 | |||
893 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
894 | { | ||
895 | lw->weight -= dec; | ||
896 | lw->inv_weight = 0; | ||
897 | } | ||
898 | |||
899 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
900 | { | ||
901 | lw->weight = w; | ||
902 | lw->inv_weight = 0; | ||
903 | } | ||
904 | |||
905 | /* | 900 | /* |
906 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 901 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
907 | * of tasks with abnormal "nice" values across CPUs the contribution that | 902 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu); | |||
1028 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1023 | extern void trigger_load_balance(struct rq *rq, int cpu); |
1029 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1024 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1030 | 1025 | ||
1031 | /* | ||
1032 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1033 | * becomes useful in lb | ||
1034 | */ | ||
1035 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1036 | extern void idle_enter_fair(struct rq *this_rq); | 1026 | extern void idle_enter_fair(struct rq *this_rq); |
1037 | extern void idle_exit_fair(struct rq *this_rq); | 1027 | extern void idle_exit_fair(struct rq *this_rq); |
1038 | #else | ||
1039 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1040 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1041 | #endif | ||
1042 | 1028 | ||
1043 | #else /* CONFIG_SMP */ | 1029 | #else /* CONFIG_SMP */ |
1044 | 1030 | ||
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
1051 | extern void sysrq_sched_debug_show(void); | 1037 | extern void sysrq_sched_debug_show(void); |
1052 | extern void sched_init_granularity(void); | 1038 | extern void sched_init_granularity(void); |
1053 | extern void update_max_interval(void); | 1039 | extern void update_max_interval(void); |
1054 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
1055 | extern void init_sched_rt_class(void); | 1040 | extern void init_sched_rt_class(void); |
1056 | extern void init_sched_fair_class(void); | 1041 | extern void init_sched_fair_class(void); |
1057 | 1042 | ||
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
1063 | 1048 | ||
1064 | extern void update_idle_cpu_load(struct rq *this_rq); | 1049 | extern void update_idle_cpu_load(struct rq *this_rq); |
1065 | 1050 | ||
1051 | extern void init_task_runnable_average(struct task_struct *p); | ||
1052 | |||
1066 | #ifdef CONFIG_PARAVIRT | 1053 | #ifdef CONFIG_PARAVIRT |
1067 | static inline u64 steal_ticks(u64 steal) | 1054 | static inline u64 steal_ticks(u64 steal) |
1068 | { | 1055 | { |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..17d7065c3872 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
61 | */ | 61 | */ |
62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct task_struct *t) |
63 | { | 63 | { |
64 | unsigned long long now = task_rq(t)->clock, delta = 0; | 64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; |
65 | 65 | ||
66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
79 | */ | 79 | */ |
80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct task_struct *t) |
81 | { | 81 | { |
82 | unsigned long long now = task_rq(t)->clock, delta = 0; | 82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; |
83 | 83 | ||
84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t) | |||
100 | { | 100 | { |
101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
103 | t->sched_info.last_queued = task_rq(t)->clock; | 103 | t->sched_info.last_queued = rq_clock(task_rq(t)); |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t) | |||
112 | */ | 112 | */ |
113 | static inline void sched_info_depart(struct task_struct *t) | 113 | static inline void sched_info_depart(struct task_struct *t) |
114 | { | 114 | { |
115 | unsigned long long delta = task_rq(t)->clock - | 115 | unsigned long long delta = rq_clock(task_rq(t)) - |
116 | t->sched_info.last_arrival; | 116 | t->sched_info.last_arrival; |
117 | 117 | ||
118 | rq_sched_info_depart(task_rq(t), delta); | 118 | rq_sched_info_depart(task_rq(t), delta); |
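
These hunks replace raw task_rq(t)->clock reads with the rq_clock() accessor. A rough sketch of what the accessors are expected to look like in kernel/sched/sched.h (the real definitions may carry additional debug checks):

/* Sketch of the accessors this file now uses; actual definitions live
 * in kernel/sched/sched.h. */
static inline u64 rq_clock(struct rq *rq)
{
	return rq->clock;		/* runqueue clock, updated by update_rq_clock() */
}

static inline u64 rq_clock_task(struct rq *rq)
{
	return rq->clock_task;		/* same clock minus irq/steal time */
}
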
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84a..e08fbeeb54b9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
28 | struct task_struct *stop = rq->stop; | 28 | struct task_struct *stop = rq->stop; |
29 | 29 | ||
30 | if (stop && stop->on_rq) { | 30 | if (stop && stop->on_rq) { |
31 | stop->se.exec_start = rq->clock_task; | 31 | stop->se.exec_start = rq_clock_task(rq); |
32 | return stop; | 32 | return stop; |
33 | } | 33 | } |
34 | 34 | ||
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
57 | struct task_struct *curr = rq->curr; | 57 | struct task_struct *curr = rq->curr; |
58 | u64 delta_exec; | 58 | u64 delta_exec; |
59 | 59 | ||
60 | delta_exec = rq->clock_task - curr->se.exec_start; | 60 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
61 | if (unlikely((s64)delta_exec < 0)) | 61 | if (unlikely((s64)delta_exec < 0)) |
62 | delta_exec = 0; | 62 | delta_exec = 0; |
63 | 63 | ||
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
67 | curr->se.sum_exec_runtime += delta_exec; | 67 | curr->se.sum_exec_runtime += delta_exec; |
68 | account_group_exec_runtime(curr, delta_exec); | 68 | account_group_exec_runtime(curr, delta_exec); |
69 | 69 | ||
70 | curr->se.exec_start = rq->clock_task; | 70 | curr->se.exec_start = rq_clock_task(rq); |
71 | cpuacct_charge(curr, delta_exec); | 71 | cpuacct_charge(curr, delta_exec); |
72 | } | 72 | } |
73 | 73 | ||
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) | |||
79 | { | 79 | { |
80 | struct task_struct *stop = rq->stop; | 80 | struct task_struct *stop = rq->stop; |
81 | 81 | ||
82 | stop->se.exec_start = rq->clock_task; | 82 | stop->se.exec_start = rq_clock_task(rq); |
83 | } | 83 | } |
84 | 84 | ||
85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) | 85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..ca25e6e704a2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
127 | 127 | ||
128 | void local_bh_disable(void) | 128 | void local_bh_disable(void) |
129 | { | 129 | { |
130 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 130 | __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); |
131 | SOFTIRQ_DISABLE_OFFSET); | ||
132 | } | 131 | } |
133 | 132 | ||
134 | EXPORT_SYMBOL(local_bh_disable); | 133 | EXPORT_SYMBOL(local_bh_disable); |
@@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) | |||
139 | WARN_ON_ONCE(!irqs_disabled()); | 138 | WARN_ON_ONCE(!irqs_disabled()); |
140 | 139 | ||
141 | if (softirq_count() == cnt) | 140 | if (softirq_count() == cnt) |
142 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | 141 | trace_softirqs_on(_RET_IP_); |
143 | sub_preempt_count(cnt); | 142 | sub_preempt_count(cnt); |
144 | } | 143 | } |
145 | 144 | ||
@@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
184 | 183 | ||
185 | void local_bh_enable(void) | 184 | void local_bh_enable(void) |
186 | { | 185 | { |
187 | _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 186 | _local_bh_enable_ip(_RET_IP_); |
188 | } | 187 | } |
189 | EXPORT_SYMBOL(local_bh_enable); | 188 | EXPORT_SYMBOL(local_bh_enable); |
190 | 189 | ||
@@ -195,8 +194,12 @@ void local_bh_enable_ip(unsigned long ip) | |||
195 | EXPORT_SYMBOL(local_bh_enable_ip); | 194 | EXPORT_SYMBOL(local_bh_enable_ip); |
196 | 195 | ||
197 | /* | 196 | /* |
198 | * We restart softirq processing for at most 2 ms, | 197 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, |
199 | * and if need_resched() is not set. | 198 | * but break the loop if need_resched() is set or after 2 ms. |
199 | * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in | ||
200 | * certain cases, such as stop_machine(), jiffies may cease to | ||
201 | * increment and so we need the MAX_SOFTIRQ_RESTART limit as | ||
202 | * well to make sure we eventually return from this method. | ||
200 | * | 203 | * |
201 | * These limits have been established via experimentation. | 204 | * These limits have been established via experimentation. |
202 | * The two things to balance is latency against fairness - | 205 | * The two things to balance is latency against fairness - |
@@ -204,6 +207,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); | |||
204 | * should not be able to lock up the box. | 207 | * should not be able to lock up the box. |
205 | */ | 208 | */ |
206 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) | 209 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) |
210 | #define MAX_SOFTIRQ_RESTART 10 | ||
207 | 211 | ||
208 | asmlinkage void __do_softirq(void) | 212 | asmlinkage void __do_softirq(void) |
209 | { | 213 | { |
@@ -212,6 +216,7 @@ asmlinkage void __do_softirq(void) | |||
212 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 216 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
213 | int cpu; | 217 | int cpu; |
214 | unsigned long old_flags = current->flags; | 218 | unsigned long old_flags = current->flags; |
219 | int max_restart = MAX_SOFTIRQ_RESTART; | ||
215 | 220 | ||
216 | /* | 221 | /* |
217 | * Mask out PF_MEMALLOC s current task context is borrowed for the | 222 | * Mask out PF_MEMALLOC s current task context is borrowed for the |
@@ -223,8 +228,7 @@ asmlinkage void __do_softirq(void) | |||
223 | pending = local_softirq_pending(); | 228 | pending = local_softirq_pending(); |
224 | account_irq_enter_time(current); | 229 | account_irq_enter_time(current); |
225 | 230 | ||
226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 231 | __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); |
227 | SOFTIRQ_OFFSET); | ||
228 | lockdep_softirq_enter(); | 232 | lockdep_softirq_enter(); |
229 | 233 | ||
230 | cpu = smp_processor_id(); | 234 | cpu = smp_processor_id(); |
@@ -265,7 +269,8 @@ restart: | |||
265 | 269 | ||
266 | pending = local_softirq_pending(); | 270 | pending = local_softirq_pending(); |
267 | if (pending) { | 271 | if (pending) { |
268 | if (time_before(jiffies, end) && !need_resched()) | 272 | if (time_before(jiffies, end) && !need_resched() && |
273 | --max_restart) | ||
269 | goto restart; | 274 | goto restart; |
270 | 275 | ||
271 | wakeup_softirqd(); | 276 | wakeup_softirqd(); |
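
The new MAX_SOFTIRQ_RESTART cap complements the 2 ms time budget: if jiffies stops advancing (for example under stop_machine()), the counter still guarantees the loop terminates. The resulting loop shape, sketched with an illustrative helper name:

/* Sketch of the __do_softirq() restart logic after this change;
 * handle_pending_softirqs() stands in for the per-vector dispatch. */
static void do_softirq_loop(void)
{
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	int max_restart = MAX_SOFTIRQ_RESTART;
	__u32 pending;

restart:
	handle_pending_softirqs();

	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		wakeup_softirqd();	/* hand the remainder to ksoftirqd */
	}
}
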
diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) | |||
362 | } | 362 | } |
363 | EXPORT_SYMBOL(unregister_reboot_notifier); | 363 | EXPORT_SYMBOL(unregister_reboot_notifier); |
364 | 364 | ||
365 | /* Add backwards compatibility for stable trees. */ | ||
366 | #ifndef PF_NO_SETAFFINITY | ||
367 | #define PF_NO_SETAFFINITY PF_THREAD_BOUND | ||
368 | #endif | ||
369 | |||
370 | static void migrate_to_reboot_cpu(void) | ||
371 | { | ||
372 | /* The boot cpu is always logical cpu 0 */ | ||
373 | int cpu = 0; | ||
374 | |||
375 | cpu_hotplug_disable(); | ||
376 | |||
377 | /* Make certain the cpu I'm about to reboot on is online */ | ||
378 | if (!cpu_online(cpu)) | ||
379 | cpu = cpumask_first(cpu_online_mask); | ||
380 | |||
381 | /* Prevent races with other tasks migrating this task */ | ||
382 | current->flags |= PF_NO_SETAFFINITY; | ||
383 | |||
384 | /* Make certain I only run on the appropriate processor */ | ||
385 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
386 | } | ||
387 | |||
365 | /** | 388 | /** |
366 | * kernel_restart - reboot the system | 389 | * kernel_restart - reboot the system |
367 | * @cmd: pointer to buffer containing command to execute for restart | 390 | * @cmd: pointer to buffer containing command to execute for restart |
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); | |||
373 | void kernel_restart(char *cmd) | 396 | void kernel_restart(char *cmd) |
374 | { | 397 | { |
375 | kernel_restart_prepare(cmd); | 398 | kernel_restart_prepare(cmd); |
376 | disable_nonboot_cpus(); | 399 | migrate_to_reboot_cpu(); |
377 | syscore_shutdown(); | 400 | syscore_shutdown(); |
378 | if (!cmd) | 401 | if (!cmd) |
379 | printk(KERN_EMERG "Restarting system.\n"); | 402 | printk(KERN_EMERG "Restarting system.\n"); |
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
400 | void kernel_halt(void) | 423 | void kernel_halt(void) |
401 | { | 424 | { |
402 | kernel_shutdown_prepare(SYSTEM_HALT); | 425 | kernel_shutdown_prepare(SYSTEM_HALT); |
403 | disable_nonboot_cpus(); | 426 | migrate_to_reboot_cpu(); |
404 | syscore_shutdown(); | 427 | syscore_shutdown(); |
405 | printk(KERN_EMERG "System halted.\n"); | 428 | printk(KERN_EMERG "System halted.\n"); |
406 | kmsg_dump(KMSG_DUMP_HALT); | 429 | kmsg_dump(KMSG_DUMP_HALT); |
@@ -419,7 +442,7 @@ void kernel_power_off(void) | |||
419 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | 442 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); |
420 | if (pm_power_off_prepare) | 443 | if (pm_power_off_prepare) |
421 | pm_power_off_prepare(); | 444 | pm_power_off_prepare(); |
422 | disable_nonboot_cpus(); | 445 | migrate_to_reboot_cpu(); |
423 | syscore_shutdown(); | 446 | syscore_shutdown(); |
424 | printk(KERN_EMERG "Power down.\n"); | 447 | printk(KERN_EMERG "Power down.\n"); |
425 | kmsg_dump(KMSG_DUMP_POWEROFF); | 448 | kmsg_dump(KMSG_DUMP_POWEROFF); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..4ce13c3cedb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled; | |||
120 | /* Constants used for minimum and maximum */ | 120 | /* Constants used for minimum and maximum */ |
121 | #ifdef CONFIG_LOCKUP_DETECTOR | 121 | #ifdef CONFIG_LOCKUP_DETECTOR |
122 | static int sixty = 60; | 122 | static int sixty = 60; |
123 | static int neg_one = -1; | ||
124 | #endif | 123 | #endif |
125 | 124 | ||
126 | static int zero; | 125 | static int zero; |
@@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = { | |||
814 | .maxlen = sizeof(int), | 813 | .maxlen = sizeof(int), |
815 | .mode = 0644, | 814 | .mode = 0644, |
816 | .proc_handler = proc_dowatchdog, | 815 | .proc_handler = proc_dowatchdog, |
817 | .extra1 = &neg_one, | 816 | .extra1 = &zero, |
818 | .extra2 = &sixty, | 817 | .extra2 = &sixty, |
819 | }, | 818 | }, |
820 | { | 819 | { |
@@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = { | |||
1044 | .mode = 0644, | 1043 | .mode = 0644, |
1045 | .proc_handler = perf_proc_update_handler, | 1044 | .proc_handler = perf_proc_update_handler, |
1046 | }, | 1045 | }, |
1046 | { | ||
1047 | .procname = "perf_cpu_time_max_percent", | ||
1048 | .data = &sysctl_perf_cpu_time_max_percent, | ||
1049 | .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), | ||
1050 | .mode = 0644, | ||
1051 | .proc_handler = perf_cpu_time_max_percent_handler, | ||
1052 | .extra1 = &zero, | ||
1053 | .extra2 = &one_hundred, | ||
1054 | }, | ||
1047 | #endif | 1055 | #endif |
1048 | #ifdef CONFIG_KMEMCHECK | 1056 | #ifdef CONFIG_KMEMCHECK |
1049 | { | 1057 | { |
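
Two things change here: the watchdog threshold's lower bound moves from -1 to 0, and a new perf_cpu_time_max_percent knob is added with a 0..100 range. For reference, a sketch of how such a bounded entry is typically declared ("my_knob" and the choice of proc_dointvec_minmax are illustrative; the perf entry above supplies its own handler, with extra1/extra2 carrying the intended range):

static int zero;
static int one_hundred = 100;
static int my_knob = 25;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* writes below 0 are rejected */
		.extra2		= &one_hundred,	/* writes above 100 are rejected */
	},
	{ }
};
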
diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dca..7c7964c33ae7 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * Modification history kernel/time.c | 11 | * Modification history kernel/time.c |
12 | * | 12 | * |
13 | * 1993-09-02 Philip Gladstone | 13 | * 1993-09-02 Philip Gladstone |
14 | * Created file with time related functions from sched.c and adjtimex() | 14 | * Created file with time related functions from sched/core.c and adjtimex() |
15 | * 1993-10-08 Torsten Duwe | 15 | * 1993-10-08 Torsten Duwe |
16 | * adjtime interface update and CMOS clock write code | 16 | * adjtime interface update and CMOS clock write code |
17 | * 1995-08-13 Torsten Duwe | 17 | * 1995-08-13 Torsten Duwe |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 12ff13a838c6..8f5b3b98577b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error) | |||
874 | void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | 874 | void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) |
875 | { | 875 | { |
876 | struct pps_normtime pts_norm, freq_norm; | 876 | struct pps_normtime pts_norm, freq_norm; |
877 | unsigned long flags; | ||
878 | 877 | ||
879 | pts_norm = pps_normalize_ts(*phase_ts); | 878 | pts_norm = pps_normalize_ts(*phase_ts); |
880 | 879 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..20d6fba70652 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -511,6 +511,12 @@ again: | |||
511 | } | 511 | } |
512 | } | 512 | } |
513 | 513 | ||
514 | /* | ||
515 | * Remove the current cpu from the pending mask. The event is | ||
516 | * delivered immediately in tick_do_broadcast() ! | ||
517 | */ | ||
518 | cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); | ||
519 | |||
514 | /* Take care of enforced broadcast requests */ | 520 | /* Take care of enforced broadcast requests */ |
515 | cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); | 521 | cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); |
516 | cpumask_clear(tick_broadcast_force_mask); | 522 | cpumask_clear(tick_broadcast_force_mask); |
@@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
575 | 581 | ||
576 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 582 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
577 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | 583 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { |
578 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | ||
579 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 584 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { |
585 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | ||
580 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 586 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); |
581 | /* | 587 | /* |
582 | * We only reprogram the broadcast timer if we | 588 | * We only reprogram the broadcast timer if we |
@@ -593,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
593 | } else { | 599 | } else { |
594 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 600 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { |
595 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 601 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
596 | if (dev->next_event.tv64 == KTIME_MAX) | ||
597 | goto out; | ||
598 | /* | 602 | /* |
599 | * The cpu which was handling the broadcast | 603 | * The cpu which was handling the broadcast |
600 | * timer marked this cpu in the broadcast | 604 | * timer marked this cpu in the broadcast |
@@ -609,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
609 | goto out; | 613 | goto out; |
610 | 614 | ||
611 | /* | 615 | /* |
616 | * Bail out if there is no next event. | ||
617 | */ | ||
618 | if (dev->next_event.tv64 == KTIME_MAX) | ||
619 | goto out; | ||
620 | /* | ||
612 | * If the pending bit is not set, then we are | 621 | * If the pending bit is not set, then we are |
613 | * either the CPU handling the broadcast | 622 | * either the CPU handling the broadcast |
614 | * interrupt or we got woken by something else. | 623 | * interrupt or we got woken by something else. |
@@ -692,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
692 | 701 | ||
693 | bc->event_handler = tick_handle_oneshot_broadcast; | 702 | bc->event_handler = tick_handle_oneshot_broadcast; |
694 | 703 | ||
695 | /* Take the do_timer update */ | ||
696 | if (!tick_nohz_full_cpu(cpu)) | ||
697 | tick_do_timer_cpu = cpu; | ||
698 | |||
699 | /* | 704 | /* |
700 | * We must be careful here. There might be other CPUs | 705 | * We must be careful here. There might be other CPUs |
701 | * waiting for periodic broadcast. We need to set the | 706 | * waiting for periodic broadcast. We need to set the |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
306 | * we can't safely shutdown that CPU. | 306 | * we can't safely shutdown that CPU. |
307 | */ | 307 | */ |
308 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | 308 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) |
309 | return -EINVAL; | 309 | return NOTIFY_BAD; |
310 | break; | 310 | break; |
311 | } | 311 | } |
312 | return NOTIFY_OK; | 312 | return NOTIFY_OK; |
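
Returning a raw -EINVAL from a hotplug notifier is not interpreted as a veto; the notifier chain expects NOTIFY_BAD (or notifier_from_errno()) to abort CPU_DOWN_PREPARE. A sketch of the convention, with an illustrative predicate:

static int my_cpu_down_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
		/* cpu_is_needed_for_timekeeping() is illustrative only */
		if (cpu_is_needed_for_timekeeping(cpu))
			return NOTIFY_BAD;	/* veto: the offline is aborted */
		break;
	}
	return NOTIFY_OK;
}
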
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..baeeb5c87cf1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -975,6 +975,14 @@ static int timekeeping_suspend(void) | |||
975 | 975 | ||
976 | read_persistent_clock(&timekeeping_suspend_time); | 976 | read_persistent_clock(&timekeeping_suspend_time); |
977 | 977 | ||
978 | /* | ||
979 | * On some systems the persistent_clock can not be detected at | ||
980 | * timekeeping_init by its return value, so if we see a valid | ||
981 | * value returned, update the persistent_clock_exists flag. | ||
982 | */ | ||
983 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | ||
984 | persistent_clock_exist = true; | ||
985 | |||
978 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 986 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
979 | write_seqcount_begin(&timekeeper_seq); | 987 | write_seqcount_begin(&timekeeper_seq); |
980 | timekeeping_forward_now(tk); | 988 | timekeeping_forward_now(tk); |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b549b0f5b977..6c508ff33c62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); | |||
120 | 120 | ||
121 | /* | 121 | /* |
122 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | 122 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
123 | * can use rcu_dereference_raw() is that elements removed from this list | 123 | * can use rcu_dereference_raw_notrace() is that elements removed from this list |
124 | * are simply leaked, so there is no need to interact with a grace-period | 124 | * are simply leaked, so there is no need to interact with a grace-period |
125 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 125 | * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle |
126 | * concurrent insertions into the ftrace_global_list. | 126 | * concurrent insertions into the ftrace_global_list. |
127 | * | 127 | * |
128 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 128 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
129 | */ | 129 | */ |
130 | #define do_for_each_ftrace_op(op, list) \ | 130 | #define do_for_each_ftrace_op(op, list) \ |
131 | op = rcu_dereference_raw(list); \ | 131 | op = rcu_dereference_raw_notrace(list); \ |
132 | do | 132 | do |
133 | 133 | ||
134 | /* | 134 | /* |
135 | * Optimized for just a single item in the list (as that is the normal case). | 135 | * Optimized for just a single item in the list (as that is the normal case). |
136 | */ | 136 | */ |
137 | #define while_for_each_ftrace_op(op) \ | 137 | #define while_for_each_ftrace_op(op) \ |
138 | while (likely(op = rcu_dereference_raw((op)->next)) && \ | 138 | while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ |
139 | unlikely((op) != &ftrace_list_end)) | 139 | unlikely((op) != &ftrace_list_end)) |
140 | 140 | ||
141 | static inline void ftrace_ops_init(struct ftrace_ops *ops) | 141 | static inline void ftrace_ops_init(struct ftrace_ops *ops) |
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
779 | if (hlist_empty(hhd)) | 779 | if (hlist_empty(hhd)) |
780 | return NULL; | 780 | return NULL; |
781 | 781 | ||
782 | hlist_for_each_entry_rcu(rec, hhd, node) { | 782 | hlist_for_each_entry_rcu_notrace(rec, hhd, node) { |
783 | if (rec->ip == ip) | 783 | if (rec->ip == ip) |
784 | return rec; | 784 | return rec; |
785 | } | 785 | } |
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
1165 | 1165 | ||
1166 | hhd = &hash->buckets[key]; | 1166 | hhd = &hash->buckets[key]; |
1167 | 1167 | ||
1168 | hlist_for_each_entry_rcu(entry, hhd, hlist) { | 1168 | hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { |
1169 | if (entry->ip == ip) | 1169 | if (entry->ip == ip) |
1170 | return entry; | 1170 | return entry; |
1171 | } | 1171 | } |
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1422 | struct ftrace_hash *notrace_hash; | 1422 | struct ftrace_hash *notrace_hash; |
1423 | int ret; | 1423 | int ret; |
1424 | 1424 | ||
1425 | filter_hash = rcu_dereference_raw(ops->filter_hash); | 1425 | filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); |
1426 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | 1426 | notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); |
1427 | 1427 | ||
1428 | if ((ftrace_hash_empty(filter_hash) || | 1428 | if ((ftrace_hash_empty(filter_hash) || |
1429 | ftrace_lookup_ip(filter_hash, ip)) && | 1429 | ftrace_lookup_ip(filter_hash, ip)) && |
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
2920 | * on the hash. rcu_read_lock is too dangerous here. | 2920 | * on the hash. rcu_read_lock is too dangerous here. |
2921 | */ | 2921 | */ |
2922 | preempt_disable_notrace(); | 2922 | preempt_disable_notrace(); |
2923 | hlist_for_each_entry_rcu(entry, hhd, node) { | 2923 | hlist_for_each_entry_rcu_notrace(entry, hhd, node) { |
2924 | if (entry->ip == ip) | 2924 | if (entry->ip == ip) |
2925 | entry->ops->func(ip, parent_ip, &entry->data); | 2925 | entry->ops->func(ip, parent_ip, &entry->data); |
2926 | } | 2926 | } |
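
Switching to the _notrace variants keeps the RCU debug checks inside these helpers from being traced and recursing back into ftrace. For context, the two macros are consumed roughly like the list-walking code in the same file (sketch; recursion protection and error handling omitted, function name illustrative):

static void my_ops_list_func(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *ignored, struct pt_regs *regs)
{
	struct ftrace_ops *op;

	do_for_each_ftrace_op(op, ftrace_ops_list) {
		if (ftrace_ops_test(op, ip))
			op->func(ip, parent_ip, op, regs);
	} while_for_each_ftrace_op(op);
}
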
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d79485b3237..e71a8be4a6ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -652,8 +652,6 @@ static struct { | |||
652 | ARCH_TRACE_CLOCKS | 652 | ARCH_TRACE_CLOCKS |
653 | }; | 653 | }; |
654 | 654 | ||
655 | int trace_clock_id; | ||
656 | |||
657 | /* | 655 | /* |
658 | * trace_parser_get_init - gets the buffer for trace parser | 656 | * trace_parser_get_init - gets the buffer for trace parser |
659 | */ | 657 | */ |
@@ -843,7 +841,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
843 | 841 | ||
844 | memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); | 842 | memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); |
845 | max_data->pid = tsk->pid; | 843 | max_data->pid = tsk->pid; |
846 | max_data->uid = task_uid(tsk); | 844 | /* |
845 | * If tsk == current, then use current_uid(), as that does not use | ||
846 | * RCU. The irq tracer can be called out of RCU scope. | ||
847 | */ | ||
848 | if (tsk == current) | ||
849 | max_data->uid = current_uid(); | ||
850 | else | ||
851 | max_data->uid = task_uid(tsk); | ||
852 | |||
847 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; | 853 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; |
848 | max_data->policy = tsk->policy; | 854 | max_data->policy = tsk->policy; |
849 | max_data->rt_priority = tsk->rt_priority; | 855 | max_data->rt_priority = tsk->rt_priority; |
@@ -2818,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
2818 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | 2824 | iter->iter_flags |= TRACE_FILE_ANNOTATE; |
2819 | 2825 | ||
2820 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | 2826 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ |
2821 | if (trace_clocks[trace_clock_id].in_ns) | 2827 | if (trace_clocks[tr->clock_id].in_ns) |
2822 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | 2828 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; |
2823 | 2829 | ||
2824 | /* stop the trace while dumping if we are not opening "snapshot" */ | 2830 | /* stop the trace while dumping if we are not opening "snapshot" */ |
@@ -3817,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3817 | iter->iter_flags |= TRACE_FILE_LAT_FMT; | 3823 | iter->iter_flags |= TRACE_FILE_LAT_FMT; |
3818 | 3824 | ||
3819 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | 3825 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ |
3820 | if (trace_clocks[trace_clock_id].in_ns) | 3826 | if (trace_clocks[tr->clock_id].in_ns) |
3821 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | 3827 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; |
3822 | 3828 | ||
3823 | iter->cpu_file = tc->cpu; | 3829 | iter->cpu_file = tc->cpu; |
@@ -5087,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
5087 | cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); | 5093 | cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); |
5088 | trace_seq_printf(s, "bytes: %ld\n", cnt); | 5094 | trace_seq_printf(s, "bytes: %ld\n", cnt); |
5089 | 5095 | ||
5090 | if (trace_clocks[trace_clock_id].in_ns) { | 5096 | if (trace_clocks[tr->clock_id].in_ns) { |
5091 | /* local or global for trace_clock */ | 5097 | /* local or global for trace_clock */ |
5092 | t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); | 5098 | t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); |
5093 | usec_rem = do_div(t, USEC_PER_SEC); | 5099 | usec_rem = do_div(t, USEC_PER_SEC); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..20572ed88c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); | |||
700 | 700 | ||
701 | extern unsigned long trace_flags; | 701 | extern unsigned long trace_flags; |
702 | 702 | ||
703 | extern int trace_clock_id; | ||
704 | |||
705 | /* Standard output formatting function used for function return traces */ | 703 | /* Standard output formatting function used for function return traces */ |
706 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 704 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
707 | 705 | ||
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 55e2cf66967b..2901e3b88590 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | |||
1159 | /* stop the tracing. */ | 1159 | /* stop the tracing. */ |
1160 | tracing_stop(); | 1160 | tracing_stop(); |
1161 | /* check the trace buffer */ | 1161 | /* check the trace buffer */ |
1162 | ret = trace_test_buffer(tr, &count); | 1162 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
1163 | trace->reset(tr); | 1163 | trace->reset(tr); |
1164 | tracing_start(); | 1164 | tracing_start(); |
1165 | 1165 | ||
diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ead..ce0daa320a26 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) | |||
287 | return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; | 287 | return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; |
288 | } | 288 | } |
289 | EXPORT_SYMBOL(bit_waitqueue); | 289 | EXPORT_SYMBOL(bit_waitqueue); |
290 | |||
291 | /* | ||
292 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | ||
293 | * index (we're keying off bit -1, but that would produce a horrible hash | ||
294 | * value). | ||
295 | */ | ||
296 | static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | ||
297 | { | ||
298 | if (BITS_PER_LONG == 64) { | ||
299 | unsigned long q = (unsigned long)p; | ||
300 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
301 | } | ||
302 | return bit_waitqueue(p, 0); | ||
303 | } | ||
304 | |||
305 | static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, | ||
306 | void *arg) | ||
307 | { | ||
308 | struct wait_bit_key *key = arg; | ||
309 | struct wait_bit_queue *wait_bit | ||
310 | = container_of(wait, struct wait_bit_queue, wait); | ||
311 | atomic_t *val = key->flags; | ||
312 | |||
313 | if (wait_bit->key.flags != key->flags || | ||
314 | wait_bit->key.bit_nr != key->bit_nr || | ||
315 | atomic_read(val) != 0) | ||
316 | return 0; | ||
317 | return autoremove_wake_function(wait, mode, sync, key); | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, | ||
322 | * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero | ||
323 | * return codes halt waiting and return. | ||
324 | */ | ||
325 | static __sched | ||
326 | int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
327 | int (*action)(atomic_t *), unsigned mode) | ||
328 | { | ||
329 | atomic_t *val; | ||
330 | int ret = 0; | ||
331 | |||
332 | do { | ||
333 | prepare_to_wait(wq, &q->wait, mode); | ||
334 | val = q->key.flags; | ||
335 | if (atomic_read(val) == 0) | ||
336 | ret = (*action)(val); | ||
337 | } while (!ret && atomic_read(val) != 0); | ||
338 | finish_wait(wq, &q->wait); | ||
339 | return ret; | ||
340 | } | ||
341 | |||
342 | #define DEFINE_WAIT_ATOMIC_T(name, p) \ | ||
343 | struct wait_bit_queue name = { \ | ||
344 | .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ | ||
345 | .wait = { \ | ||
346 | .private = current, \ | ||
347 | .func = wake_atomic_t_function, \ | ||
348 | .task_list = \ | ||
349 | LIST_HEAD_INIT((name).wait.task_list), \ | ||
350 | }, \ | ||
351 | } | ||
352 | |||
353 | __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), | ||
354 | unsigned mode) | ||
355 | { | ||
356 | wait_queue_head_t *wq = atomic_t_waitqueue(p); | ||
357 | DEFINE_WAIT_ATOMIC_T(wait, p); | ||
358 | |||
359 | return __wait_on_atomic_t(wq, &wait, action, mode); | ||
360 | } | ||
361 | EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | ||
362 | |||
363 | /** | ||
364 | * wake_up_atomic_t - Wake up a waiter on an atomic_t | ||
365 | * @word: The word being waited on, a kernel virtual address | ||
366 | * @bit: The bit of the word being waited on | ||
367 | * | ||
368 | * Wake up anyone waiting for the atomic_t to go to zero. | ||
369 | * | ||
370 | * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t | ||
371 | * check is done by the waiter's wake function, not by the waker | ||
372 | */ | ||
373 | void wake_up_atomic_t(atomic_t *p) | ||
374 | { | ||
375 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | ||
376 | } | ||
377 | EXPORT_SYMBOL(wake_up_atomic_t); | ||
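
A usage sketch for the new interface, assuming the wait_on_atomic_t() inline wrapper that accompanies it in <linux/wait.h> (my_obj, my_wait() and the usage counter are illustrative): a waiter sleeps until the counter hits zero, and the path that drops the last reference issues the wake-up.

#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/sched.h>

struct my_obj {
	atomic_t	usage;
	/* ... */
};

static int my_wait(atomic_t *p)
{
	schedule();		/* returning 0 means "check the counter again" */
	return 0;
}

void my_obj_put(struct my_obj *obj)
{
	if (atomic_dec_and_test(&obj->usage))
		wake_up_atomic_t(&obj->usage);
}

void my_obj_wait_until_unused(struct my_obj *obj)
{
	wait_on_atomic_t(&obj->usage, my_wait, TASK_UNINTERRUPTIBLE);
}
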
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee8e29a2320c..f02c4a4a0c3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; | |||
272 | static bool wq_disable_numa; | 272 | static bool wq_disable_numa; |
273 | module_param_named(disable_numa, wq_disable_numa, bool, 0444); | 273 | module_param_named(disable_numa, wq_disable_numa, bool, 0444); |
274 | 274 | ||
275 | /* see the comment above the definition of WQ_POWER_EFFICIENT */ | ||
276 | #ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT | ||
277 | static bool wq_power_efficient = true; | ||
278 | #else | ||
279 | static bool wq_power_efficient; | ||
280 | #endif | ||
281 | |||
282 | module_param_named(power_efficient, wq_power_efficient, bool, 0444); | ||
283 | |||
275 | static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ | 284 | static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ |
276 | 285 | ||
277 | /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ | 286 | /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ |
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; | |||
305 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 314 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
306 | struct workqueue_struct *system_freezable_wq __read_mostly; | 315 | struct workqueue_struct *system_freezable_wq __read_mostly; |
307 | EXPORT_SYMBOL_GPL(system_freezable_wq); | 316 | EXPORT_SYMBOL_GPL(system_freezable_wq); |
317 | struct workqueue_struct *system_power_efficient_wq __read_mostly; | ||
318 | EXPORT_SYMBOL_GPL(system_power_efficient_wq); | ||
319 | struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; | ||
320 | EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | ||
308 | 321 | ||
309 | static int worker_thread(void *__worker); | 322 | static int worker_thread(void *__worker); |
310 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 323 | static void copy_workqueue_attrs(struct workqueue_attrs *to, |
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4086 | struct workqueue_struct *wq; | 4099 | struct workqueue_struct *wq; |
4087 | struct pool_workqueue *pwq; | 4100 | struct pool_workqueue *pwq; |
4088 | 4101 | ||
4102 | /* see the comment above the definition of WQ_POWER_EFFICIENT */ | ||
4103 | if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) | ||
4104 | flags |= WQ_UNBOUND; | ||
4105 | |||
4089 | /* allocate wq and format name */ | 4106 | /* allocate wq and format name */ |
4090 | if (flags & WQ_UNBOUND) | 4107 | if (flags & WQ_UNBOUND) |
4091 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); | 4108 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); |
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void) | |||
4985 | WQ_UNBOUND_MAX_ACTIVE); | 5002 | WQ_UNBOUND_MAX_ACTIVE); |
4986 | system_freezable_wq = alloc_workqueue("events_freezable", | 5003 | system_freezable_wq = alloc_workqueue("events_freezable", |
4987 | WQ_FREEZABLE, 0); | 5004 | WQ_FREEZABLE, 0); |
5005 | system_power_efficient_wq = alloc_workqueue("events_power_efficient", | ||
5006 | WQ_POWER_EFFICIENT, 0); | ||
5007 | system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", | ||
5008 | WQ_FREEZABLE | WQ_POWER_EFFICIENT, | ||
5009 | 0); | ||
4988 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || | 5010 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || |
4989 | !system_unbound_wq || !system_freezable_wq); | 5011 | !system_unbound_wq || !system_freezable_wq || |
5012 | !system_power_efficient_wq || | ||
5013 | !system_freezable_power_efficient_wq); | ||
4990 | return 0; | 5014 | return 0; |
4991 | } | 5015 | } |
4992 | early_initcall(init_workqueues); | 5016 | early_initcall(init_workqueues); |
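
For callers, the new flag and system queues are opt-in. A sketch of how a driver might use them, assuming the WQ_POWER_EFFICIENT flag this hunk references from include/linux/workqueue.h ("my_wq", my_work_fn() and the module boilerplate are illustrative): when CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y or workqueue.power_efficient=1 is set, such queues are promoted to WQ_UNBOUND, so the scheduler can place the work on an already-busy CPU instead of waking an idle one.

#include <linux/workqueue.h>
#include <linux/module.h>

static struct workqueue_struct *my_wq;

static void my_work_fn(struct work_struct *work)
{
	/* background work that is not latency critical */
}
static DECLARE_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
	my_wq = alloc_workqueue("my_wq", WQ_POWER_EFFICIENT, 0);
	if (!my_wq)
		return -ENOMEM;

	queue_work(my_wq, &my_work);
	/* one-off items could instead use system_power_efficient_wq */
	return 0;
}
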
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index ad83c96b2ece..7e2204db0b1a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void) | |||
64 | 64 | ||
65 | /* | 65 | /* |
66 | * Scheduler hooks for concurrency managed workqueue. Only to be used from | 66 | * Scheduler hooks for concurrency managed workqueue. Only to be used from |
67 | * sched.c and workqueue.c. | 67 | * sched/core.c and workqueue.c. |
68 | */ | 68 | */ |
69 | void wq_worker_waking_up(struct task_struct *task, int cpu); | 69 | void wq_worker_waking_up(struct task_struct *task, int cpu); |
70 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); | 70 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); |