Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks          |    6
-rw-r--r--  kernel/audit.c                |    2
-rw-r--r--  kernel/audit_tree.c           |    1
-rw-r--r--  kernel/context_tracking.c     |   41
-rw-r--r--  kernel/cpu.c                  |   55
-rw-r--r--  kernel/cpu/idle.c             |   17
-rw-r--r--  kernel/cpuset.c               |    4
-rw-r--r--  kernel/events/core.c          |  511
-rw-r--r--  kernel/events/hw_breakpoint.c |  193
-rw-r--r--  kernel/events/internal.h      |    4
-rw-r--r--  kernel/exit.c                 |    2
-rw-r--r--  kernel/futex.c                |    6
-rw-r--r--  kernel/irq/chip.c             |   13
-rw-r--r--  kernel/irq/generic-chip.c     |  314
-rw-r--r--  kernel/irq/irqdomain.c        |   17
-rw-r--r--  kernel/irq/manage.c           |   17
-rw-r--r--  kernel/kprobes.c              |   30
-rw-r--r--  kernel/mutex.c                |  384
-rw-r--r--  kernel/power/Kconfig          |   21
-rw-r--r--  kernel/printk.c               |   91
-rw-r--r--  kernel/ptrace.c               |   20
-rw-r--r--  kernel/range.c                |   21
-rw-r--r--  kernel/rcupdate.c             |   29
-rw-r--r--  kernel/rcutiny.c              |   21
-rw-r--r--  kernel/rcutiny_plugin.h       | 1009
-rw-r--r--  kernel/rcutorture.c           |   39
-rw-r--r--  kernel/rcutree.c              |  189
-rw-r--r--  kernel/rcutree.h              |   17
-rw-r--r--  kernel/rcutree_plugin.h       |   81
-rw-r--r--  kernel/resource.c             |    1
-rw-r--r--  kernel/rtmutex.c              |   13
-rw-r--r--  kernel/sched/Makefile         |    2
-rw-r--r--  kernel/sched/auto_group.c     |    3
-rw-r--r--  kernel/sched/core.c           |  660
-rw-r--r--  kernel/sched/cputime.c        |   11
-rw-r--r--  kernel/sched/debug.c          |   37
-rw-r--r--  kernel/sched/fair.c           |  175
-rw-r--r--  kernel/sched/proc.c           |  591
-rw-r--r--  kernel/sched/rt.c             |  132
-rw-r--r--  kernel/sched/sched.h          |   71
-rw-r--r--  kernel/sched/stats.h          |    8
-rw-r--r--  kernel/sched/stop_task.c      |    8
-rw-r--r--  kernel/softirq.c              |   23
-rw-r--r--  kernel/sys.c                  |   29
-rw-r--r--  kernel/sysctl.c               |   12
-rw-r--r--  kernel/time.c                 |    2
-rw-r--r--  kernel/time/ntp.c             |    1
-rw-r--r--  kernel/time/tick-broadcast.c  |   19
-rw-r--r--  kernel/time/tick-sched.c      |    2
-rw-r--r--  kernel/time/timekeeping.c     |    8
-rw-r--r--  kernel/trace/ftrace.c         |   18
-rw-r--r--  kernel/trace/trace.c          |   18
-rw-r--r--  kernel/trace/trace.h          |    2
-rw-r--r--  kernel/trace/trace_selftest.c |    2
-rw-r--r--  kernel/wait.c                 |   88
-rw-r--r--  kernel/workqueue.c            |   26
-rw-r--r--  kernel/workqueue_internal.h   |    2
57 files changed, 2590 insertions(+), 2529 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100eaa..d2b32ac27a39 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH
138 138
139config INLINE_SPIN_UNLOCK_IRQ 139config INLINE_SPIN_UNLOCK_IRQ
140 def_bool y 140 def_bool y
141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH 141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
142 142
143config INLINE_SPIN_UNLOCK_IRQRESTORE 143config INLINE_SPIN_UNLOCK_IRQRESTORE
144 def_bool y 144 def_bool y
@@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH
175 175
176config INLINE_READ_UNLOCK_IRQ 176config INLINE_READ_UNLOCK_IRQ
177 def_bool y 177 def_bool y
178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH 178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
179 179
180config INLINE_READ_UNLOCK_IRQRESTORE 180config INLINE_READ_UNLOCK_IRQRESTORE
181 def_bool y 181 def_bool y
@@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH
212 212
213config INLINE_WRITE_UNLOCK_IRQ 213config INLINE_WRITE_UNLOCK_IRQ
214 def_bool y 214 def_bool y
215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH 215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
216 216
217config INLINE_WRITE_UNLOCK_IRQRESTORE 217config INLINE_WRITE_UNLOCK_IRQRESTORE
218 def_bool y 218 def_bool y
diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd3..91e53d04b6a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1056static void wait_for_auditd(unsigned long sleep_time) 1056static void wait_for_auditd(unsigned long sleep_time)
1057{ 1057{
1058 DECLARE_WAITQUEUE(wait, current); 1058 DECLARE_WAITQUEUE(wait, current);
1059 set_current_state(TASK_INTERRUPTIBLE); 1059 set_current_state(TASK_UNINTERRUPTIBLE);
1060 add_wait_queue(&audit_backlog_wait, &wait); 1060 add_wait_queue(&audit_backlog_wait, &wait);
1061 1061
1062 if (audit_backlog_limit && 1062 if (audit_backlog_limit &&
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..43c307dc9453 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
658 struct vfsmount *mnt; 658 struct vfsmount *mnt;
659 int err; 659 int err;
660 660
661 rule->tree = NULL;
661 list_for_each_entry(tree, &tree_list, list) { 662 list_for_each_entry(tree, &tree_list, list) {
662 if (!strcmp(seed->pathname, tree->pathname)) { 663 if (!strcmp(seed->pathname, tree->pathname)) {
663 put_tree(seed); 664 put_tree(seed);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..383f8231e436 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
15 */ 15 */
16 16
17#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
19#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -71,6 +70,46 @@ void user_enter(void)
71 local_irq_restore(flags); 70 local_irq_restore(flags);
72} 71}
73 72
73#ifdef CONFIG_PREEMPT
74/**
75 * preempt_schedule_context - preempt_schedule called by tracing
76 *
77 * The tracing infrastructure uses preempt_enable_notrace to prevent
78 * recursion and tracing preempt enabling caused by the tracing
79 * infrastructure itself. But as tracing can happen in areas coming
80 * from userspace or just about to enter userspace, a preempt enable
81 * can occur before user_exit() is called. This will cause the scheduler
82 * to be called when the system is still in usermode.
83 *
84 * To prevent this, the preempt_enable_notrace will use this function
85 * instead of preempt_schedule() to exit user context if needed before
86 * calling the scheduler.
87 */
88void __sched notrace preempt_schedule_context(void)
89{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx;
92
93 if (likely(ti->preempt_count || irqs_disabled()))
94 return;
95
96 /*
97 * Need to disable preemption in case user_exit() is traced
98 * and the tracer calls preempt_enable_notrace() causing
99 * an infinite recursion.
100 */
101 preempt_disable_notrace();
102 prev_ctx = exception_enter();
103 preempt_enable_no_resched_notrace();
104
105 preempt_schedule();
106
107 preempt_disable_notrace();
108 exception_exit(prev_ctx);
109 preempt_enable_notrace();
110}
111EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */
74 113
75/** 114/**
76 * user_exit - Inform the context tracking that the CPU is 115 * user_exit - Inform the context tracking that the CPU is
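
As a minimal sketch of the caller side (an assumption, since the companion preempt.h change is not part of this diff), a notrace preempt-enable path would drop the preempt count without rescheduling and then dispatch to preempt_schedule_context() rather than preempt_schedule(), so that user context is exited before the scheduler runs; the my_preempt_enable_notrace() name below is illustrative only:

#define my_preempt_enable_notrace()                                     \
do {                                                                    \
        /* drop the count without a traced reschedule */                \
        preempt_enable_no_resched_notrace();                            \
        barrier();                                                      \
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))               \
                preempt_schedule_context();                             \
} while (0)
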
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427e..198a38883e64 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
134} 134}
135 135
136/*
137 * Wait for currently running CPU hotplug operations to complete (if any) and
138 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
139 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
140 * hotplug path before performing hotplug operations. So acquiring that lock
141 * guarantees mutual exclusion from any currently running hotplug operations.
142 */
143void cpu_hotplug_disable(void)
144{
145 cpu_maps_update_begin();
146 cpu_hotplug_disabled = 1;
147 cpu_maps_update_done();
148}
149
150void cpu_hotplug_enable(void)
151{
152 cpu_maps_update_begin();
153 cpu_hotplug_disabled = 0;
154 cpu_maps_update_done();
155}
156
136#else /* #if CONFIG_HOTPLUG_CPU */ 157#else /* #if CONFIG_HOTPLUG_CPU */
137static void cpu_hotplug_begin(void) {} 158static void cpu_hotplug_begin(void) {}
138static void cpu_hotplug_done(void) {} 159static void cpu_hotplug_done(void) {}
@@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void)
541core_initcall(alloc_frozen_cpus); 562core_initcall(alloc_frozen_cpus);
542 563
543/* 564/*
544 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
545 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
546 * to continue until any currently running CPU hotplug operation gets
547 * completed.
548 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
549 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
550 * CPU hotplug path and released only after it is complete. Thus, we
551 * (and hence the freezer) will block here until any currently running CPU
552 * hotplug operation gets completed.
553 */
554void cpu_hotplug_disable_before_freeze(void)
555{
556 cpu_maps_update_begin();
557 cpu_hotplug_disabled = 1;
558 cpu_maps_update_done();
559}
560
561
562/*
563 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
564 * disabled while beginning to freeze tasks).
565 */
566void cpu_hotplug_enable_after_thaw(void)
567{
568 cpu_maps_update_begin();
569 cpu_hotplug_disabled = 0;
570 cpu_maps_update_done();
571}
572
573/*
574 * When callbacks for CPU hotplug notifications are being executed, we must 565 * When callbacks for CPU hotplug notifications are being executed, we must
575 * ensure that the state of the system with respect to the tasks being frozen 566 * ensure that the state of the system with respect to the tasks being frozen
576 * or not, as reported by the notification, remains unchanged *throughout the 567 * or not, as reported by the notification, remains unchanged *throughout the
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
589 580
590 case PM_SUSPEND_PREPARE: 581 case PM_SUSPEND_PREPARE:
591 case PM_HIBERNATION_PREPARE: 582 case PM_HIBERNATION_PREPARE:
592 cpu_hotplug_disable_before_freeze(); 583 cpu_hotplug_disable();
593 break; 584 break;
594 585
595 case PM_POST_SUSPEND: 586 case PM_POST_SUSPEND:
596 case PM_POST_HIBERNATION: 587 case PM_POST_HIBERNATION:
597 cpu_hotplug_enable_after_thaw(); 588 cpu_hotplug_enable();
598 break; 589 break;
599 590
600 default: 591 default:
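
With the freezer-specific helpers folded into cpu_hotplug_disable()/cpu_hotplug_enable(), any path that needs the set of online CPUs to stay stable can bracket its work the same way the PM notifier above does. A minimal usage sketch (only the two hotplug helpers come from this patch; the caller and scan_online_cpus() are hypothetical):

static int do_work_with_stable_cpu_set(void)
{
        int ret;

        cpu_hotplug_disable();          /* waits for in-flight hotplug, then blocks new requests */
        ret = scan_online_cpus();       /* hypothetical work over cpu_online_mask */
        cpu_hotplug_enable();

        return ret;
}
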
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index d5585f5e038e..e695c0a0bcb5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/tick.h> 6#include <linux/tick.h>
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/stackprotector.h>
8 9
9#include <asm/tlb.h> 10#include <asm/tlb.h>
10 11
@@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
58void __weak arch_cpu_idle(void) 59void __weak arch_cpu_idle(void)
59{ 60{
60 cpu_idle_force_poll = 1; 61 cpu_idle_force_poll = 1;
62 local_irq_enable();
61} 63}
62 64
63/* 65/*
@@ -112,6 +114,21 @@ static void cpu_idle_loop(void)
112 114
113void cpu_startup_entry(enum cpuhp_state state) 115void cpu_startup_entry(enum cpuhp_state state)
114{ 116{
117 /*
118 * This #ifdef needs to die, but it's too late in the cycle to
119 * make this generic (arm and sh have never invoked the canary
120 * init for the non boot cpus!). Will be fixed in 3.11
121 */
122#ifdef CONFIG_X86
123 /*
124 * If we're the non-boot CPU, nothing set the stack canary up
125 * for us. The boot CPU already has it initialized but no harm
126 * in doing it again. This is a good place for updating it, as
127 * we wont ever return from this function (so the invalid
128 * canaries already on the stack wont ever trigger).
129 */
130 boot_init_stack_canary();
131#endif
115 current_set_polling(); 132 current_set_polling();
116 arch_cpu_idle_prepare(); 133 arch_cpu_idle_prepare();
117 cpu_idle_loop(); 134 cpu_idle_loop();
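
The weak arch_cpu_idle() above now re-enables interrupts because the idle loop invokes it with interrupts disabled and relies on the callback to turn them back on. A sketch of an architecture override honouring that contract (illustrative, not taken from this diff; on x86 the enable and the halt are done atomically):

void arch_cpu_idle(void)
{
        safe_halt();    /* "sti; hlt": re-enable interrupts and halt in one step */
}
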
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..902d13fc2b13 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
540 * This function builds a partial partition of the systems CPUs 540 * This function builds a partial partition of the systems CPUs
541 * A 'partial partition' is a set of non-overlapping subsets whose 541 * A 'partial partition' is a set of non-overlapping subsets whose
542 * union is a subset of that set. 542 * union is a subset of that set.
543 * The output of this function needs to be passed to kernel/sched.c 543 * The output of this function needs to be passed to kernel/sched/core.c
544 * partition_sched_domains() routine, which will rebuild the scheduler's 544 * partition_sched_domains() routine, which will rebuild the scheduler's
545 * load balancing domains (sched domains) as specified by that partial 545 * load balancing domains (sched domains) as specified by that partial
546 * partition. 546 * partition.
@@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
569 * is a subset of one of these domains, while there are as 569 * is a subset of one of these domains, while there are as
570 * many such domains as possible, each as small as possible. 570 * many such domains as possible, each as small as possible.
571 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 571 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
572 * the kernel/sched.c routine partition_sched_domains() in a 572 * the kernel/sched/core.c routine partition_sched_domains() in a
573 * convenient format, that can be easily compared to the prior 573 * convenient format, that can be easily compared to the prior
574 * value to determine what partition elements (sched domains) 574 * value to determine what partition elements (sched domains)
575 * were changed (added or removed.) 575 * were changed (added or removed.)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
165/* 165/*
166 * max perf event sample rate 166 * max perf event sample rate
167 */ 167 */
168#define DEFAULT_MAX_SAMPLE_RATE 100000 168#define DEFAULT_MAX_SAMPLE_RATE 100000
169int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 169#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
170static int max_samples_per_tick __read_mostly = 170#define DEFAULT_CPU_TIME_MAX_PERCENT 25
171 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 171
172int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
173
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176
177static atomic_t perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
179
180void update_perf_cpu_limits(void)
181{
182 u64 tmp = perf_sample_period_ns;
183
184 tmp *= sysctl_perf_cpu_time_max_percent;
185 tmp = do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp);
187}
188
189static int perf_rotate_context(struct perf_cpu_context *cpuctx);
172 190
173int perf_proc_update_handler(struct ctl_table *table, int write, 191int perf_proc_update_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 192 void __user *buffer, size_t *lenp,
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
180 return ret; 198 return ret;
181 199
182 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 200 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
201 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
202 update_perf_cpu_limits();
183 203
184 return 0; 204 return 0;
185} 205}
186 206
207int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
208
209int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
210 void __user *buffer, size_t *lenp,
211 loff_t *ppos)
212{
213 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
214
215 if (ret || !write)
216 return ret;
217
218 update_perf_cpu_limits();
219
220 return 0;
221}
222
223/*
224 * perf samples are done in some very critical code paths (NMIs).
225 * If they take too much CPU time, the system can lock up and not
226 * get any real work done. This will drop the sample rate when
227 * we detect that events are taking too long.
228 */
229#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length);
231
232void perf_sample_event_took(u64 sample_len_ns)
233{
234 u64 avg_local_sample_len;
235 u64 local_samples_len = __get_cpu_var(running_sample_length);
236
237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return;
239
240 /* decay the counter by 1 average sample */
241 local_samples_len = __get_cpu_var(running_sample_length);
242 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
243 local_samples_len += sample_len_ns;
244 __get_cpu_var(running_sample_length) = local_samples_len;
245
246 /*
247 * note: this will be biased artifically low until we have
248 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
249 * from having to maintain a count.
250 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
254 return;
255
256 if (max_samples_per_tick <= 1)
257 return;
258
259 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
260 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262
263 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate);
269
270 update_perf_cpu_limits();
271}
272
187static atomic64_t perf_event_id; 273static atomic64_t perf_event_id;
188 274
189static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 275static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
196static void update_context_time(struct perf_event_context *ctx); 282static void update_context_time(struct perf_event_context *ctx);
197static u64 perf_event_time(struct perf_event *event); 283static u64 perf_event_time(struct perf_event *event);
198 284
199static void ring_buffer_attach(struct perf_event *event,
200 struct ring_buffer *rb);
201
202void __weak perf_event_print_debug(void) { } 285void __weak perf_event_print_debug(void) { }
203 286
204extern __weak const char *perf_pmu_name(void) 287extern __weak const char *perf_pmu_name(void)
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
658} 741}
659#endif 742#endif
660 743
744/*
745 * set default to be dependent on timer tick just
746 * like original code
747 */
748#define PERF_CPU_HRTIMER (1000 / HZ)
749/*
750 * function must be called with interrupts disbled
751 */
752static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
753{
754 struct perf_cpu_context *cpuctx;
755 enum hrtimer_restart ret = HRTIMER_NORESTART;
756 int rotations = 0;
757
758 WARN_ON(!irqs_disabled());
759
760 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
761
762 rotations = perf_rotate_context(cpuctx);
763
764 /*
765 * arm timer if needed
766 */
767 if (rotations) {
768 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
769 ret = HRTIMER_RESTART;
770 }
771
772 return ret;
773}
774
775/* CPU is going down */
776void perf_cpu_hrtimer_cancel(int cpu)
777{
778 struct perf_cpu_context *cpuctx;
779 struct pmu *pmu;
780 unsigned long flags;
781
782 if (WARN_ON(cpu != smp_processor_id()))
783 return;
784
785 local_irq_save(flags);
786
787 rcu_read_lock();
788
789 list_for_each_entry_rcu(pmu, &pmus, entry) {
790 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
791
792 if (pmu->task_ctx_nr == perf_sw_context)
793 continue;
794
795 hrtimer_cancel(&cpuctx->hrtimer);
796 }
797
798 rcu_read_unlock();
799
800 local_irq_restore(flags);
801}
802
803static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
804{
805 struct hrtimer *hr = &cpuctx->hrtimer;
806 struct pmu *pmu = cpuctx->ctx.pmu;
807 int timer;
808
809 /* no multiplexing needed for SW PMU */
810 if (pmu->task_ctx_nr == perf_sw_context)
811 return;
812
813 /*
814 * check default is sane, if not set then force to
815 * default interval (1/tick)
816 */
817 timer = pmu->hrtimer_interval_ms;
818 if (timer < 1)
819 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
820
821 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
822
823 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
824 hr->function = perf_cpu_hrtimer_handler;
825}
826
827static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
828{
829 struct hrtimer *hr = &cpuctx->hrtimer;
830 struct pmu *pmu = cpuctx->ctx.pmu;
831
832 /* not for SW PMU */
833 if (pmu->task_ctx_nr == perf_sw_context)
834 return;
835
836 if (hrtimer_active(hr))
837 return;
838
839 if (!hrtimer_callback_running(hr))
840 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
841 0, HRTIMER_MODE_REL_PINNED, 0);
842}
843
661void perf_pmu_disable(struct pmu *pmu) 844void perf_pmu_disable(struct pmu *pmu)
662{ 845{
663 int *count = this_cpu_ptr(pmu->pmu_disable_count); 846 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,
1506 1689
1507 if (event_sched_in(group_event, cpuctx, ctx)) { 1690 if (event_sched_in(group_event, cpuctx, ctx)) {
1508 pmu->cancel_txn(pmu); 1691 pmu->cancel_txn(pmu);
1692 perf_cpu_hrtimer_restart(cpuctx);
1509 return -EAGAIN; 1693 return -EAGAIN;
1510 } 1694 }
1511 1695
@@ -1552,6 +1736,8 @@ group_error:
1552 1736
1553 pmu->cancel_txn(pmu); 1737 pmu->cancel_txn(pmu);
1554 1738
1739 perf_cpu_hrtimer_restart(cpuctx);
1740
1555 return -EAGAIN; 1741 return -EAGAIN;
1556} 1742}
1557 1743
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info)
1807 * If this event can't go on and it's part of a 1993 * If this event can't go on and it's part of a
1808 * group, then the whole group has to come off. 1994 * group, then the whole group has to come off.
1809 */ 1995 */
1810 if (leader != event) 1996 if (leader != event) {
1811 group_sched_out(leader, cpuctx, ctx); 1997 group_sched_out(leader, cpuctx, ctx);
1998 perf_cpu_hrtimer_restart(cpuctx);
1999 }
1812 if (leader->attr.pinned) { 2000 if (leader->attr.pinned) {
1813 update_group_times(leader); 2001 update_group_times(leader);
1814 leader->state = PERF_EVENT_STATE_ERROR; 2002 leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2555 * because they're strictly cpu affine and rotate_start is called with IRQs 2743 * because they're strictly cpu affine and rotate_start is called with IRQs
2556 * disabled, while rotate_context is called from IRQ context. 2744 * disabled, while rotate_context is called from IRQ context.
2557 */ 2745 */
2558static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2746static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2559{ 2747{
2560 struct perf_event_context *ctx = NULL; 2748 struct perf_event_context *ctx = NULL;
2561 int rotate = 0, remove = 1; 2749 int rotate = 0, remove = 1;
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2594done: 2782done:
2595 if (remove) 2783 if (remove)
2596 list_del_init(&cpuctx->rotation_list); 2784 list_del_init(&cpuctx->rotation_list);
2785
2786 return rotate;
2597} 2787}
2598 2788
2599#ifdef CONFIG_NO_HZ_FULL 2789#ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void)
2625 ctx = cpuctx->task_ctx; 2815 ctx = cpuctx->task_ctx;
2626 if (ctx) 2816 if (ctx)
2627 perf_adjust_freq_unthr_context(ctx, throttled); 2817 perf_adjust_freq_unthr_context(ctx, throttled);
2628
2629 if (cpuctx->jiffies_interval == 1 ||
2630 !(jiffies % cpuctx->jiffies_interval))
2631 perf_rotate_context(cpuctx);
2632 } 2818 }
2633} 2819}
2634 2820
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head)
2918} 3104}
2919 3105
2920static void ring_buffer_put(struct ring_buffer *rb); 3106static void ring_buffer_put(struct ring_buffer *rb);
3107static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
2921 3108
2922static void free_event(struct perf_event *event) 3109static void free_event(struct perf_event *event)
2923{ 3110{
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event)
2942 if (has_branch_stack(event)) { 3129 if (has_branch_stack(event)) {
2943 static_key_slow_dec_deferred(&perf_sched_events); 3130 static_key_slow_dec_deferred(&perf_sched_events);
2944 /* is system-wide event */ 3131 /* is system-wide event */
2945 if (!(event->attach_state & PERF_ATTACH_TASK)) 3132 if (!(event->attach_state & PERF_ATTACH_TASK)) {
2946 atomic_dec(&per_cpu(perf_branch_stack_events, 3133 atomic_dec(&per_cpu(perf_branch_stack_events,
2947 event->cpu)); 3134 event->cpu));
3135 }
2948 } 3136 }
2949 } 3137 }
2950 3138
2951 if (event->rb) { 3139 if (event->rb) {
2952 ring_buffer_put(event->rb); 3140 struct ring_buffer *rb;
2953 event->rb = NULL; 3141
3142 /*
3143 * Can happen when we close an event with re-directed output.
3144 *
3145 * Since we have a 0 refcount, perf_mmap_close() will skip
3146 * over us; possibly making our ring_buffer_put() the last.
3147 */
3148 mutex_lock(&event->mmap_mutex);
3149 rb = event->rb;
3150 if (rb) {
3151 rcu_assign_pointer(event->rb, NULL);
3152 ring_buffer_detach(event, rb);
3153 ring_buffer_put(rb); /* could be last */
3154 }
3155 mutex_unlock(&event->mmap_mutex);
2954 } 3156 }
2955 3157
2956 if (is_cgroup_event(event)) 3158 if (is_cgroup_event(event))
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3188 unsigned int events = POLL_HUP; 3390 unsigned int events = POLL_HUP;
3189 3391
3190 /* 3392 /*
3191 * Race between perf_event_set_output() and perf_poll(): perf_poll() 3393 * Pin the event->rb by taking event->mmap_mutex; otherwise
3192 * grabs the rb reference but perf_event_set_output() overrides it. 3394 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3193 * Here is the timeline for two threads T1, T2:
3194 * t0: T1, rb = rcu_dereference(event->rb)
3195 * t1: T2, old_rb = event->rb
3196 * t2: T2, event->rb = new rb
3197 * t3: T2, ring_buffer_detach(old_rb)
3198 * t4: T1, ring_buffer_attach(rb1)
3199 * t5: T1, poll_wait(event->waitq)
3200 *
3201 * To avoid this problem, we grab mmap_mutex in perf_poll()
3202 * thereby ensuring that the assignment of the new ring buffer
3203 * and the detachment of the old buffer appear atomic to perf_poll()
3204 */ 3395 */
3205 mutex_lock(&event->mmap_mutex); 3396 mutex_lock(&event->mmap_mutex);
3206 3397 rb = event->rb;
3207 rcu_read_lock(); 3398 if (rb)
3208 rb = rcu_dereference(event->rb);
3209 if (rb) {
3210 ring_buffer_attach(event, rb);
3211 events = atomic_xchg(&rb->poll, 0); 3399 events = atomic_xchg(&rb->poll, 0);
3212 }
3213 rcu_read_unlock();
3214
3215 mutex_unlock(&event->mmap_mutex); 3400 mutex_unlock(&event->mmap_mutex);
3216 3401
3217 poll_wait(file, &event->waitq, wait); 3402 poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event,
3521 return; 3706 return;
3522 3707
3523 spin_lock_irqsave(&rb->event_lock, flags); 3708 spin_lock_irqsave(&rb->event_lock, flags);
3524 if (!list_empty(&event->rb_entry)) 3709 if (list_empty(&event->rb_entry))
3525 goto unlock; 3710 list_add(&event->rb_entry, &rb->event_list);
3526
3527 list_add(&event->rb_entry, &rb->event_list);
3528unlock:
3529 spin_unlock_irqrestore(&rb->event_lock, flags); 3711 spin_unlock_irqrestore(&rb->event_lock, flags);
3530} 3712}
3531 3713
3532static void ring_buffer_detach(struct perf_event *event, 3714static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
3533 struct ring_buffer *rb)
3534{ 3715{
3535 unsigned long flags; 3716 unsigned long flags;
3536 3717
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
3549 3730
3550 rcu_read_lock(); 3731 rcu_read_lock();
3551 rb = rcu_dereference(event->rb); 3732 rb = rcu_dereference(event->rb);
3552 if (!rb) 3733 if (rb) {
3553 goto unlock; 3734 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3554 3735 wake_up_all(&event->waitq);
3555 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3736 }
3556 wake_up_all(&event->waitq);
3557
3558unlock:
3559 rcu_read_unlock(); 3737 rcu_read_unlock();
3560} 3738}
3561 3739
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3584 3762
3585static void ring_buffer_put(struct ring_buffer *rb) 3763static void ring_buffer_put(struct ring_buffer *rb)
3586{ 3764{
3587 struct perf_event *event, *n;
3588 unsigned long flags;
3589
3590 if (!atomic_dec_and_test(&rb->refcount)) 3765 if (!atomic_dec_and_test(&rb->refcount))
3591 return; 3766 return;
3592 3767
3593 spin_lock_irqsave(&rb->event_lock, flags); 3768 WARN_ON_ONCE(!list_empty(&rb->event_list));
3594 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3595 list_del_init(&event->rb_entry);
3596 wake_up_all(&event->waitq);
3597 }
3598 spin_unlock_irqrestore(&rb->event_lock, flags);
3599 3769
3600 call_rcu(&rb->rcu_head, rb_free_rcu); 3770 call_rcu(&rb->rcu_head, rb_free_rcu);
3601} 3771}
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
3605 struct perf_event *event = vma->vm_file->private_data; 3775 struct perf_event *event = vma->vm_file->private_data;
3606 3776
3607 atomic_inc(&event->mmap_count); 3777 atomic_inc(&event->mmap_count);
3778 atomic_inc(&event->rb->mmap_count);
3608} 3779}
3609 3780
3781/*
3782 * A buffer can be mmap()ed multiple times; either directly through the same
3783 * event, or through other events by use of perf_event_set_output().
3784 *
3785 * In order to undo the VM accounting done by perf_mmap() we need to destroy
3786 * the buffer here, where we still have a VM context. This means we need
3787 * to detach all events redirecting to us.
3788 */
3610static void perf_mmap_close(struct vm_area_struct *vma) 3789static void perf_mmap_close(struct vm_area_struct *vma)
3611{ 3790{
3612 struct perf_event *event = vma->vm_file->private_data; 3791 struct perf_event *event = vma->vm_file->private_data;
3613 3792
3614 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3793 struct ring_buffer *rb = event->rb;
3615 unsigned long size = perf_data_size(event->rb); 3794 struct user_struct *mmap_user = rb->mmap_user;
3616 struct user_struct *user = event->mmap_user; 3795 int mmap_locked = rb->mmap_locked;
3617 struct ring_buffer *rb = event->rb; 3796 unsigned long size = perf_data_size(rb);
3797
3798 atomic_dec(&rb->mmap_count);
3799
3800 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3801 return;
3618 3802
3619 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3803 /* Detach current event from the buffer. */
3620 vma->vm_mm->pinned_vm -= event->mmap_locked; 3804 rcu_assign_pointer(event->rb, NULL);
3621 rcu_assign_pointer(event->rb, NULL); 3805 ring_buffer_detach(event, rb);
3622 ring_buffer_detach(event, rb); 3806 mutex_unlock(&event->mmap_mutex);
3807
3808 /* If there's still other mmap()s of this buffer, we're done. */
3809 if (atomic_read(&rb->mmap_count)) {
3810 ring_buffer_put(rb); /* can't be last */
3811 return;
3812 }
3813
3814 /*
3815 * No other mmap()s, detach from all other events that might redirect
3816 * into the now unreachable buffer. Somewhat complicated by the
3817 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3818 */
3819again:
3820 rcu_read_lock();
3821 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3822 if (!atomic_long_inc_not_zero(&event->refcount)) {
3823 /*
3824 * This event is en-route to free_event() which will
3825 * detach it and remove it from the list.
3826 */
3827 continue;
3828 }
3829 rcu_read_unlock();
3830
3831 mutex_lock(&event->mmap_mutex);
3832 /*
3833 * Check we didn't race with perf_event_set_output() which can
3834 * swizzle the rb from under us while we were waiting to
3835 * acquire mmap_mutex.
3836 *
3837 * If we find a different rb; ignore this event, a next
3838 * iteration will no longer find it on the list. We have to
3839 * still restart the iteration to make sure we're not now
3840 * iterating the wrong list.
3841 */
3842 if (event->rb == rb) {
3843 rcu_assign_pointer(event->rb, NULL);
3844 ring_buffer_detach(event, rb);
3845 ring_buffer_put(rb); /* can't be last, we still have one */
3846 }
3623 mutex_unlock(&event->mmap_mutex); 3847 mutex_unlock(&event->mmap_mutex);
3848 put_event(event);
3624 3849
3625 ring_buffer_put(rb); 3850 /*
3626 free_uid(user); 3851 * Restart the iteration; either we're on the wrong list or
3852 * destroyed its integrity by doing a deletion.
3853 */
3854 goto again;
3627 } 3855 }
3856 rcu_read_unlock();
3857
3858 /*
3859 * It could be there's still a few 0-ref events on the list; they'll
3860 * get cleaned up by free_event() -- they'll also still have their
3861 * ref on the rb and will free it whenever they are done with it.
3862 *
3863 * Aside from that, this buffer is 'fully' detached and unmapped,
3864 * undo the VM accounting.
3865 */
3866
3867 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
3868 vma->vm_mm->pinned_vm -= mmap_locked;
3869 free_uid(mmap_user);
3870
3871 ring_buffer_put(rb); /* could be last */
3628} 3872}
3629 3873
3630static const struct vm_operations_struct perf_mmap_vmops = { 3874static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3674 return -EINVAL; 3918 return -EINVAL;
3675 3919
3676 WARN_ON_ONCE(event->ctx->parent_ctx); 3920 WARN_ON_ONCE(event->ctx->parent_ctx);
3921again:
3677 mutex_lock(&event->mmap_mutex); 3922 mutex_lock(&event->mmap_mutex);
3678 if (event->rb) { 3923 if (event->rb) {
3679 if (event->rb->nr_pages == nr_pages) 3924 if (event->rb->nr_pages != nr_pages) {
3680 atomic_inc(&event->rb->refcount);
3681 else
3682 ret = -EINVAL; 3925 ret = -EINVAL;
3926 goto unlock;
3927 }
3928
3929 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
3930 /*
3931 * Raced against perf_mmap_close() through
3932 * perf_event_set_output(). Try again, hope for better
3933 * luck.
3934 */
3935 mutex_unlock(&event->mmap_mutex);
3936 goto again;
3937 }
3938
3683 goto unlock; 3939 goto unlock;
3684 } 3940 }
3685 3941
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3720 ret = -ENOMEM; 3976 ret = -ENOMEM;
3721 goto unlock; 3977 goto unlock;
3722 } 3978 }
3723 rcu_assign_pointer(event->rb, rb); 3979
3980 atomic_set(&rb->mmap_count, 1);
3981 rb->mmap_locked = extra;
3982 rb->mmap_user = get_current_user();
3724 3983
3725 atomic_long_add(user_extra, &user->locked_vm); 3984 atomic_long_add(user_extra, &user->locked_vm);
3726 event->mmap_locked = extra; 3985 vma->vm_mm->pinned_vm += extra;
3727 event->mmap_user = get_current_user(); 3986
3728 vma->vm_mm->pinned_vm += event->mmap_locked; 3987 ring_buffer_attach(event, rb);
3988 rcu_assign_pointer(event->rb, rb);
3729 3989
3730 perf_event_update_userpage(event); 3990 perf_event_update_userpage(event);
3731 3991
@@ -3734,7 +3994,11 @@ unlock:
3734 atomic_inc(&event->mmap_count); 3994 atomic_inc(&event->mmap_count);
3735 mutex_unlock(&event->mmap_mutex); 3995 mutex_unlock(&event->mmap_mutex);
3736 3996
3737 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3997 /*
3998 * Since pinned accounting is per vm we cannot allow fork() to copy our
3999 * vma.
4000 */
4001 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
3738 vma->vm_ops = &perf_mmap_vmops; 4002 vma->vm_ops = &perf_mmap_vmops;
3739 4003
3740 return ret; 4004 return ret;
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4961 * sign as trigger. 5225 * sign as trigger.
4962 */ 5226 */
4963 5227
4964static u64 perf_swevent_set_period(struct perf_event *event) 5228u64 perf_swevent_set_period(struct perf_event *event)
4965{ 5229{
4966 struct hw_perf_event *hwc = &event->hw; 5230 struct hw_perf_event *hwc = &event->hw;
4967 u64 period = hwc->last_period; 5231 u64 period = hwc->last_period;
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
5904 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6168 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5905} 6169}
5906 6170
6171static ssize_t
6172perf_event_mux_interval_ms_show(struct device *dev,
6173 struct device_attribute *attr,
6174 char *page)
6175{
6176 struct pmu *pmu = dev_get_drvdata(dev);
6177
6178 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6179}
6180
6181static ssize_t
6182perf_event_mux_interval_ms_store(struct device *dev,
6183 struct device_attribute *attr,
6184 const char *buf, size_t count)
6185{
6186 struct pmu *pmu = dev_get_drvdata(dev);
6187 int timer, cpu, ret;
6188
6189 ret = kstrtoint(buf, 0, &timer);
6190 if (ret)
6191 return ret;
6192
6193 if (timer < 1)
6194 return -EINVAL;
6195
6196 /* same value, noting to do */
6197 if (timer == pmu->hrtimer_interval_ms)
6198 return count;
6199
6200 pmu->hrtimer_interval_ms = timer;
6201
6202 /* update all cpuctx for this PMU */
6203 for_each_possible_cpu(cpu) {
6204 struct perf_cpu_context *cpuctx;
6205 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6206 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6207
6208 if (hrtimer_active(&cpuctx->hrtimer))
6209 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6210 }
6211
6212 return count;
6213}
6214
6215#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
6216
5907static struct device_attribute pmu_dev_attrs[] = { 6217static struct device_attribute pmu_dev_attrs[] = {
5908 __ATTR_RO(type), 6218 __ATTR_RO(type),
5909 __ATTR_NULL, 6219 __ATTR_RW(perf_event_mux_interval_ms),
6220 __ATTR_NULL,
5910}; 6221};
5911 6222
5912static int pmu_bus_running; 6223static int pmu_bus_running;
@@ -5952,7 +6263,7 @@ free_dev:
5952static struct lock_class_key cpuctx_mutex; 6263static struct lock_class_key cpuctx_mutex;
5953static struct lock_class_key cpuctx_lock; 6264static struct lock_class_key cpuctx_lock;
5954 6265
5955int perf_pmu_register(struct pmu *pmu, char *name, int type) 6266int perf_pmu_register(struct pmu *pmu, const char *name, int type)
5956{ 6267{
5957 int cpu, ret; 6268 int cpu, ret;
5958 6269
@@ -6001,7 +6312,9 @@ skip_type:
6001 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6312 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6002 cpuctx->ctx.type = cpu_context; 6313 cpuctx->ctx.type = cpu_context;
6003 cpuctx->ctx.pmu = pmu; 6314 cpuctx->ctx.pmu = pmu;
6004 cpuctx->jiffies_interval = 1; 6315
6316 __perf_cpu_hrtimer_init(cpuctx, cpu);
6317
6005 INIT_LIST_HEAD(&cpuctx->rotation_list); 6318 INIT_LIST_HEAD(&cpuctx->rotation_list);
6006 cpuctx->unique_pmu = pmu; 6319 cpuctx->unique_pmu = pmu;
6007 } 6320 }
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6327 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 6640 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6328 return -EINVAL; 6641 return -EINVAL;
6329 6642
6330 /* kernel level capture: check permissions */
6331 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6332 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6333 return -EACCES;
6334
6335 /* propagate priv level, when not set for branch */ 6643 /* propagate priv level, when not set for branch */
6336 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 6644 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6337 6645
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6349 */ 6657 */
6350 attr->branch_sample_type = mask; 6658 attr->branch_sample_type = mask;
6351 } 6659 }
6660 /* privileged levels capture (kernel, hv): check permissions */
6661 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6662 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6663 return -EACCES;
6352 } 6664 }
6353 6665
6354 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 6666 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -6412,6 +6724,8 @@ set:
6412 if (atomic_read(&event->mmap_count)) 6724 if (atomic_read(&event->mmap_count))
6413 goto unlock; 6725 goto unlock;
6414 6726
6727 old_rb = event->rb;
6728
6415 if (output_event) { 6729 if (output_event) {
6416 /* get the rb we want to redirect to */ 6730 /* get the rb we want to redirect to */
6417 rb = ring_buffer_get(output_event); 6731 rb = ring_buffer_get(output_event);
@@ -6419,16 +6733,28 @@ set:
6419 goto unlock; 6733 goto unlock;
6420 } 6734 }
6421 6735
6422 old_rb = event->rb;
6423 rcu_assign_pointer(event->rb, rb);
6424 if (old_rb) 6736 if (old_rb)
6425 ring_buffer_detach(event, old_rb); 6737 ring_buffer_detach(event, old_rb);
6738
6739 if (rb)
6740 ring_buffer_attach(event, rb);
6741
6742 rcu_assign_pointer(event->rb, rb);
6743
6744 if (old_rb) {
6745 ring_buffer_put(old_rb);
6746 /*
6747 * Since we detached before setting the new rb, so that we
6748 * could attach the new rb, we could have missed a wakeup.
6749 * Provide it now.
6750 */
6751 wake_up_all(&event->waitq);
6752 }
6753
6426 ret = 0; 6754 ret = 0;
6427unlock: 6755unlock:
6428 mutex_unlock(&event->mmap_mutex); 6756 mutex_unlock(&event->mmap_mutex);
6429 6757
6430 if (old_rb)
6431 ring_buffer_put(old_rb);
6432out: 6758out:
6433 return ret; 6759 return ret;
6434} 6760}
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7387 case CPU_DOWN_PREPARE: 7713 case CPU_DOWN_PREPARE:
7388 perf_event_exit_cpu(cpu); 7714 perf_event_exit_cpu(cpu);
7389 break; 7715 break;
7390
7391 default: 7716 default:
7392 break; 7717 break;
7393 } 7718 }
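
To make the new throttling arithmetic concrete, below is a stand-alone model of the decaying average used by perf_sample_event_took() (illustration only; NR_ACCUMULATED_SAMPLES and running_sample_length mirror the patch, the harness around them is not kernel code). With the defaults introduced above, the 100000 Hz maximum sample rate gives a 10000 ns period and a 25% budget of 2500 ns per sample, so a steady stream of 4000 ns samples crosses the budget after roughly 125 samples, at which point the kernel halves max_samples_per_tick and prints the ratelimited warning. The perf_event_mux_interval_ms attribute registered further down sits next to the existing 'type' attribute of each PMU device and re-arms any active multiplexing hrtimer when written.

#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

static uint64_t running_sample_length;

/* returns 1 once the running average exceeds the allowed per-sample budget */
static int sample_took(uint64_t sample_len_ns, uint64_t allowed_ns)
{
        /* decay by one average sample, then add the cost of the new one */
        running_sample_length -= running_sample_length / NR_ACCUMULATED_SAMPLES;
        running_sample_length += sample_len_ns;

        return (running_sample_length / NR_ACCUMULATED_SAMPLES) > allowed_ns;
}

int main(void)
{
        uint64_t allowed_ns = 2500;     /* 25% of the 10000 ns default sample period */
        int i;

        for (i = 1; i <= 1000; i++) {
                if (sample_took(4000, allowed_ns)) {
                        printf("budget exceeded after %d samples\n", i);
                        break;
                }
        }
        return 0;
}
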
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1f..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
46#include <linux/smp.h> 46#include <linux/smp.h>
47 47
48#include <linux/hw_breakpoint.h> 48#include <linux/hw_breakpoint.h>
49
50
51/* 49/*
52 * Constraints data 50 * Constraints data
53 */ 51 */
52struct bp_cpuinfo {
53 /* Number of pinned cpu breakpoints in a cpu */
54 unsigned int cpu_pinned;
55 /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
56 unsigned int *tsk_pinned;
57 /* Number of non-pinned cpu/task breakpoints in a cpu */
58 unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
59};
54 60
55/* Number of pinned cpu breakpoints in a cpu */ 61static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
56static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
57
58/* Number of pinned task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
60
61/* Number of non-pinned cpu/task breakpoints in a cpu */
62static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
63
64static int nr_slots[TYPE_MAX]; 62static int nr_slots[TYPE_MAX];
65 63
64static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
65{
66 return per_cpu_ptr(bp_cpuinfo + type, cpu);
67}
68
66/* Keep track of the breakpoints attached to tasks */ 69/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head); 70static LIST_HEAD(bp_task_head);
68 71
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
96 */ 99 */
97static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) 100static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
98{ 101{
102 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
99 int i; 103 int i;
100 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
101 104
102 for (i = nr_slots[type] - 1; i >= 0; i--) { 105 for (i = nr_slots[type] - 1; i >= 0; i--) {
103 if (tsk_pinned[i] > 0) 106 if (tsk_pinned[i] > 0)
@@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 123 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && 124 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type && 125 find_slot_idx(iter) == type &&
123 cpu == iter->cpu) 126 (iter->cpu < 0 || cpu == iter->cpu))
124 count += hw_breakpoint_weight(iter); 127 count += hw_breakpoint_weight(iter);
125 } 128 }
126 129
127 return count; 130 return count;
128} 131}
129 132
133static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
134{
135 if (bp->cpu >= 0)
136 return cpumask_of(bp->cpu);
137 return cpu_possible_mask;
138}
139
130/* 140/*
131 * Report the number of pinned/un-pinned breakpoints we have in 141 * Report the number of pinned/un-pinned breakpoints we have in
132 * a given cpu (cpu > -1) or in all of them (cpu = -1). 142 * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
135fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, 145fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
136 enum bp_type_idx type) 146 enum bp_type_idx type)
137{ 147{
138 int cpu = bp->cpu; 148 const struct cpumask *cpumask = cpumask_of_bp(bp);
139 struct task_struct *tsk = bp->hw.bp_target; 149 int cpu;
140
141 if (cpu >= 0) {
142 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
143 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type);
145 else
146 slots->pinned += task_bp_pinned(cpu, bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148
149 return;
150 }
151 150
152 for_each_online_cpu(cpu) { 151 for_each_cpu(cpu, cpumask) {
153 unsigned int nr; 152 struct bp_cpuinfo *info = get_bp_info(cpu, type);
153 int nr;
154 154
155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu); 155 nr = info->cpu_pinned;
156 if (!tsk) 156 if (!bp->hw.bp_target)
157 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
158 else 158 else
159 nr += task_bp_pinned(cpu, bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
161 if (nr > slots->pinned) 161 if (nr > slots->pinned)
162 slots->pinned = nr; 162 slots->pinned = nr;
163 163
164 nr = per_cpu(nr_bp_flexible[type], cpu); 164 nr = info->flexible;
165
166 if (nr > slots->flexible) 165 if (nr > slots->flexible)
167 slots->flexible = nr; 166 slots->flexible = nr;
168 } 167 }
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
182/* 181/*
183 * Add a pinned breakpoint for the given task in our constraint table 182 * Add a pinned breakpoint for the given task in our constraint table
184 */ 183 */
185static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, 184static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
186 enum bp_type_idx type, int weight) 185 enum bp_type_idx type, int weight)
187{ 186{
188 unsigned int *tsk_pinned; 187 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
189 int old_count = 0; 188 int old_idx, new_idx;
190 int old_idx = 0; 189
191 int idx = 0; 190 old_idx = task_bp_pinned(cpu, bp, type) - 1;
192 191 new_idx = old_idx + weight;
193 old_count = task_bp_pinned(cpu, bp, type); 192
194 old_idx = old_count - 1; 193 if (old_idx >= 0)
195 idx = old_idx + weight; 194 tsk_pinned[old_idx]--;
196 195 if (new_idx >= 0)
197 /* tsk_pinned[n] is the number of tasks having n breakpoints */ 196 tsk_pinned[new_idx]++;
198 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
199 if (enable) {
200 tsk_pinned[idx]++;
201 if (old_count > 0)
202 tsk_pinned[old_idx]--;
203 } else {
204 tsk_pinned[idx]--;
205 if (old_count > 0)
206 tsk_pinned[old_idx]++;
207 }
208} 197}
209 198
210/* 199/*
@@ -214,33 +203,26 @@ static void
214toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, 203toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
215 int weight) 204 int weight)
216{ 205{
217 int cpu = bp->cpu; 206 const struct cpumask *cpumask = cpumask_of_bp(bp);
218 struct task_struct *tsk = bp->hw.bp_target; 207 int cpu;
219 208
220 /* Pinned counter cpu profiling */ 209 if (!enable)
221 if (!tsk) { 210 weight = -weight;
222 211
223 if (enable) 212 /* Pinned counter cpu profiling */
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 213 if (!bp->hw.bp_target) {
225 else 214 get_bp_info(bp->cpu, type)->cpu_pinned += weight;
226 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
227 return; 215 return;
228 } 216 }
229 217
230 /* Pinned counter task profiling */ 218 /* Pinned counter task profiling */
231 219 for_each_cpu(cpu, cpumask)
232 if (!enable) 220 toggle_bp_task_slot(bp, cpu, type, weight);
233 list_del(&bp->hw.bp_list);
234
235 if (cpu >= 0) {
236 toggle_bp_task_slot(bp, cpu, enable, type, weight);
237 } else {
238 for_each_online_cpu(cpu)
239 toggle_bp_task_slot(bp, cpu, enable, type, weight);
240 }
241 221
242 if (enable) 222 if (enable)
243 list_add_tail(&bp->hw.bp_list, &bp_task_head); 223 list_add_tail(&bp->hw.bp_list, &bp_task_head);
224 else
225 list_del(&bp->hw.bp_list);
244} 226}
245 227
246/* 228/*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
261 * 243 *
262 * - If attached to a single cpu, check: 244 * - If attached to a single cpu, check:
263 * 245 *
264 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 246 * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
265 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM 247 * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
266 * 248 *
267 * -> If there are already non-pinned counters in this cpu, it means 249 * -> If there are already non-pinned counters in this cpu, it means
268 * there is already a free slot for them. 250 * there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
272 * 254 *
273 * - If attached to every cpus, check: 255 * - If attached to every cpus, check:
274 * 256 *
275 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 257 * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
276 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM 258 * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
277 * 259 *
278 * -> This is roughly the same, except we check the number of per cpu 260 * -> This is roughly the same, except we check the number of per cpu
279 * bp for every cpu and we keep the max one. Same for the per tasks 261 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
284 * 266 *
285 * - If attached to a single cpu, check: 267 * - If attached to a single cpu, check:
286 * 268 *
287 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 269 * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
288 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM 270 * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
289 * 271 *
290 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 272 * -> Same checks as before. But now the info->flexible, if any, must keep
291 * one register at least (or they will never be fed). 273 * one register at least (or they will never be fed).
292 * 274 *
293 * - If attached to every cpus, check: 275 * - If attached to every cpus, check:
294 * 276 *
295 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 277 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
296 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 278 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
297 */ 279 */
298static int __reserve_bp_slot(struct perf_event *bp) 280static int __reserve_bp_slot(struct perf_event *bp)
299{ 281{
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
518 perf_overflow_handler_t triggered, 500 perf_overflow_handler_t triggered,
519 void *context) 501 void *context)
520{ 502{
521 struct perf_event * __percpu *cpu_events, **pevent, *bp; 503 struct perf_event * __percpu *cpu_events, *bp;
522 long err; 504 long err = 0;
523 int cpu; 505 int cpu;
524 506
525 cpu_events = alloc_percpu(typeof(*cpu_events)); 507 cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
528 510
529 get_online_cpus(); 511 get_online_cpus();
530 for_each_online_cpu(cpu) { 512 for_each_online_cpu(cpu) {
531 pevent = per_cpu_ptr(cpu_events, cpu);
532 bp = perf_event_create_kernel_counter(attr, cpu, NULL, 513 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
533 triggered, context); 514 triggered, context);
534
535 *pevent = bp;
536
537 if (IS_ERR(bp)) { 515 if (IS_ERR(bp)) {
538 err = PTR_ERR(bp); 516 err = PTR_ERR(bp);
539 goto fail; 517 break;
540 } 518 }
541 }
542 put_online_cpus();
543 519
544 return cpu_events; 520 per_cpu(*cpu_events, cpu) = bp;
545
546fail:
547 for_each_online_cpu(cpu) {
548 pevent = per_cpu_ptr(cpu_events, cpu);
549 if (IS_ERR(*pevent))
550 break;
551 unregister_hw_breakpoint(*pevent);
552 } 521 }
553 put_online_cpus(); 522 put_online_cpus();
554 523
555 free_percpu(cpu_events); 524 if (likely(!err))
525 return cpu_events;
526
527 unregister_wide_hw_breakpoint(cpu_events);
556 return (void __percpu __force *)ERR_PTR(err); 528 return (void __percpu __force *)ERR_PTR(err);
557} 529}
558EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 530EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
564void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) 536void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
565{ 537{
566 int cpu; 538 int cpu;
567 struct perf_event **pevent;
568 539
569 for_each_possible_cpu(cpu) { 540 for_each_possible_cpu(cpu)
570 pevent = per_cpu_ptr(cpu_events, cpu); 541 unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
571 unregister_hw_breakpoint(*pevent); 542
572 }
573 free_percpu(cpu_events); 543 free_percpu(cpu_events);
574} 544}
575EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); 545EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
612 if (!(flags & PERF_EF_START)) 582 if (!(flags & PERF_EF_START))
613 bp->hw.state = PERF_HES_STOPPED; 583 bp->hw.state = PERF_HES_STOPPED;
614 584
585 if (is_sampling_event(bp)) {
586 bp->hw.last_period = bp->hw.sample_period;
587 perf_swevent_set_period(bp);
588 }
589
615 return arch_install_hw_breakpoint(bp); 590 return arch_install_hw_breakpoint(bp);
616} 591}
617 592
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
650 625
651int __init init_hw_breakpoint(void) 626int __init init_hw_breakpoint(void)
652{ 627{
653 unsigned int **task_bp_pinned;
654 int cpu, err_cpu; 628 int cpu, err_cpu;
655 int i; 629 int i;
656 630
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
659 633
660 for_each_possible_cpu(cpu) { 634 for_each_possible_cpu(cpu) {
661 for (i = 0; i < TYPE_MAX; i++) { 635 for (i = 0; i < TYPE_MAX; i++) {
662 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); 636 struct bp_cpuinfo *info = get_bp_info(cpu, i);
663 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], 637
664 GFP_KERNEL); 638 info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
665 if (!*task_bp_pinned) 639 GFP_KERNEL);
640 if (!info->tsk_pinned)
666 goto err_alloc; 641 goto err_alloc;
667 } 642 }
668 } 643 }
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 651 err_alloc:
677 for_each_possible_cpu(err_cpu) { 652 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 653 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); 654 kfree(get_bp_info(err_cpu, i)->tsk_pinned);
680 if (err_cpu == cpu) 655 if (err_cpu == cpu)
681 break; 656 break;
682 } 657 }
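
The consolidated tsk_pinned bookkeeping is easy to misread, so here is a tiny stand-alone model of what toggle_bp_task_slot() now does (illustration only; the move_task_histogram() name is made up). tsk_pinned[n] counts the tasks that currently hold n+1 pinned breakpoints on the CPU, and toggle_bp_slot() passes a negative weight on release:

static void move_task_histogram(unsigned int *tsk_pinned, int pinned_before, int weight)
{
        int old_idx = pinned_before - 1;        /* -1 when the task held no breakpoint */
        int new_idx = old_idx + weight;

        if (old_idx >= 0)
                tsk_pinned[old_idx]--;          /* the task leaves its old bucket */
        if (new_idx >= 0)
                tsk_pinned[new_idx]++;          /* and lands in the new one */
}
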
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
31 spinlock_t event_lock; 31 spinlock_t event_lock;
32 struct list_head event_list; 32 struct list_head event_list;
33 33
34 atomic_t mmap_count;
35 unsigned long mmap_locked;
36 struct user_struct *mmap_user;
37
34 struct perf_event_mmap_page *user_page; 38 struct perf_event_mmap_page *user_page;
35 void *data_pages[0]; 39 void *data_pages[0];
36}; 40};
diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..7bb73f9d09db 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
649 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 649 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
650 */ 650 */
651 forget_original_parent(tsk); 651 forget_original_parent(tsk);
652 exit_task_namespaces(tsk);
653 652
654 write_lock_irq(&tasklist_lock); 653 write_lock_irq(&tasklist_lock);
655 if (group_dead) 654 if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
795 exit_shm(tsk); 794 exit_shm(tsk);
796 exit_files(tsk); 795 exit_files(tsk);
797 exit_fs(tsk); 796 exit_fs(tsk);
797 exit_task_namespaces(tsk);
798 exit_task_work(tsk); 798 exit_task_work(tsk);
799 check_stack_usage(); 799 check_stack_usage();
800 exit_thread(); 800 exit_thread();
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c94..c3a1a55a5214 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h>
65#include <linux/freezer.h>
64 66
65#include <asm/futex.h> 67#include <asm/futex.h>
66 68
@@ -365,7 +367,7 @@ again:
365 } else { 367 } else {
366 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 368 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
367 key->shared.inode = page_head->mapping->host; 369 key->shared.inode = page_head->mapping->host;
368 key->shared.pgoff = page_head->index; 370 key->shared.pgoff = basepage_index(page);
369 } 371 }
370 372
371 get_futex_key_refs(key); 373 get_futex_key_refs(key);
@@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1807 * is no timeout, or if it has yet to expire. 1809 * is no timeout, or if it has yet to expire.
1808 */ 1810 */
1809 if (!timeout || timeout->task) 1811 if (!timeout || timeout->task)
1810 schedule(); 1812 freezable_schedule();
1811 } 1813 }
1812 __set_current_state(TASK_RUNNING); 1814 __set_current_state(TASK_RUNNING);
1813} 1815}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cbd97ce0b000..a3bb14fbe5c6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc)
213 irq_state_clr_masked(desc); 213 irq_state_clr_masked(desc);
214} 214}
215 215
216/**
217 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled
219 *
220 * If the chip does not implement the irq_disable callback, we
221 * use a lazy disable approach. That means we mark the interrupt
222 * disabled, but leave the hardware unmasked. That's an
223 * optimization because we avoid the hardware access for the
224 * common case where no interrupt happens after we marked it
225 * disabled. If an interrupt happens, then the interrupt flow
226 * handler masks the line at the hardware level and marks it
227 * pending.
228 */
216void irq_disable(struct irq_desc *desc) 229void irq_disable(struct irq_desc *desc)
217{ 230{
218 irq_state_set_disabled(desc); 231 irq_state_set_disabled(desc);
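
The lazy-disable scheme described in the new irq_disable() comment can be pictured with a small standalone sketch; none of these names are kernel APIs, they only model the three bits of state involved:

struct lazy_line {
	int disabled;	/* software state set by the disable call */
	int masked;	/* what is actually programmed into the chip */
	int pending;	/* an interrupt arrived while soft-disabled */
};

static void lazy_disable(struct lazy_line *l)
{
	l->disabled = 1;		/* no hardware access here */
}

static void flow_handler(struct lazy_line *l)
{
	if (l->disabled) {
		/* only now pay for the register write */
		l->masked = 1;
		l->pending = 1;
		return;
	}
	/* ... run the action handlers ... */
}

static void lazy_enable(struct lazy_line *l)
{
	l->disabled = 0;
	if (l->masked) {
		l->masked = 0;		/* unmask the line in hardware */
		if (l->pending) {
			l->pending = 0;
			/* replay the interrupt that was caught above */
		}
	}
}
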
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f668..1c39eccc1eaf 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/irqdomain.h>
10#include <linux/interrupt.h> 11#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
@@ -16,11 +17,6 @@
16static LIST_HEAD(gc_list); 17static LIST_HEAD(gc_list);
17static DEFINE_RAW_SPINLOCK(gc_lock); 18static DEFINE_RAW_SPINLOCK(gc_lock);
18 19
19static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
20{
21 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
22}
23
24/** 20/**
25 * irq_gc_noop - NOOP function 21 * irq_gc_noop - NOOP function
26 * @d: irq_data 22 * @d: irq_data
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)
39void irq_gc_mask_disable_reg(struct irq_data *d) 35void irq_gc_mask_disable_reg(struct irq_data *d)
40{ 36{
41 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 37 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
42 u32 mask = 1 << (d->irq - gc->irq_base); 38 struct irq_chip_type *ct = irq_data_get_chip_type(d);
39 u32 mask = d->mask;
43 40
44 irq_gc_lock(gc); 41 irq_gc_lock(gc);
45 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); 42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
46 gc->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
47 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
48} 45}
49 46
50/** 47/**
51 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register 48 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
52 * @d: irq_data 49 * @d: irq_data
53 * 50 *
54 * Chip has a single mask register. Values of this register are cached 51 * Chip has a single mask register. Values of this register are cached
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
57void irq_gc_mask_set_bit(struct irq_data *d) 54void irq_gc_mask_set_bit(struct irq_data *d)
58{ 55{
59 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 56 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
60 u32 mask = 1 << (d->irq - gc->irq_base); 57 struct irq_chip_type *ct = irq_data_get_chip_type(d);
58 u32 mask = d->mask;
61 59
62 irq_gc_lock(gc); 60 irq_gc_lock(gc);
63 gc->mask_cache |= mask; 61 *ct->mask_cache |= mask;
64 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
65 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
66} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
67 66
68/** 67/**
69 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
70 * @d: irq_data 69 * @d: irq_data
71 * 70 *
72 * Chip has a single mask register. Values of this register are cached 71 * Chip has a single mask register. Values of this register are cached
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)
75void irq_gc_mask_clr_bit(struct irq_data *d) 74void irq_gc_mask_clr_bit(struct irq_data *d)
76{ 75{
77 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
78 u32 mask = 1 << (d->irq - gc->irq_base); 77 struct irq_chip_type *ct = irq_data_get_chip_type(d);
78 u32 mask = d->mask;
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 gc->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
85 86
86/** 87/**
87 * irq_gc_unmask_enable_reg - Unmask chip via enable register 88 * irq_gc_unmask_enable_reg - Unmask chip via enable register
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
93void irq_gc_unmask_enable_reg(struct irq_data *d) 94void irq_gc_unmask_enable_reg(struct irq_data *d)
94{ 95{
95 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 96 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
96 u32 mask = 1 << (d->irq - gc->irq_base); 97 struct irq_chip_type *ct = irq_data_get_chip_type(d);
98 u32 mask = d->mask;
97 99
98 irq_gc_lock(gc); 100 irq_gc_lock(gc);
99 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); 101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
100 gc->mask_cache |= mask; 102 *ct->mask_cache |= mask;
101 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
102} 104}
103 105
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
108void irq_gc_ack_set_bit(struct irq_data *d) 110void irq_gc_ack_set_bit(struct irq_data *d)
109{ 111{
110 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 112 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
111 u32 mask = 1 << (d->irq - gc->irq_base); 113 struct irq_chip_type *ct = irq_data_get_chip_type(d);
114 u32 mask = d->mask;
112 115
113 irq_gc_lock(gc); 116 irq_gc_lock(gc);
114 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
115 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
116} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
117 121
118/** 122/**
119 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit 123 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
@@ -122,10 +126,11 @@ void irq_gc_ack_set_bit(struct irq_data *d)
122void irq_gc_ack_clr_bit(struct irq_data *d) 126void irq_gc_ack_clr_bit(struct irq_data *d)
123{ 127{
124 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 128 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
125 u32 mask = ~(1 << (d->irq - gc->irq_base)); 129 struct irq_chip_type *ct = irq_data_get_chip_type(d);
130 u32 mask = ~d->mask;
126 131
127 irq_gc_lock(gc); 132 irq_gc_lock(gc);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
129 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
130} 135}
131 136
@@ -136,11 +141,12 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
136void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) 141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
137{ 142{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 143 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base); 144 struct irq_chip_type *ct = irq_data_get_chip_type(d);
145 u32 mask = d->mask;
140 146
141 irq_gc_lock(gc); 147 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); 148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
143 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
144 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
145} 151}
146 152
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
151void irq_gc_eoi(struct irq_data *d) 157void irq_gc_eoi(struct irq_data *d)
152{ 158{
153 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 159 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
154 u32 mask = 1 << (d->irq - gc->irq_base); 160 struct irq_chip_type *ct = irq_data_get_chip_type(d);
161 u32 mask = d->mask;
155 162
156 irq_gc_lock(gc); 163 irq_gc_lock(gc);
157 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); 164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
158 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
159} 166}
160 167
161/** 168/**
162 * irq_gc_set_wake - Set/clr wake bit for an interrupt 169 * irq_gc_set_wake - Set/clr wake bit for an interrupt
163 * @d: irq_data 170 * @d: irq_data
171 * @on: Indicates whether the wake bit should be set or cleared
164 * 172 *
165 * For chips where the wake from suspend functionality is not 173 * For chips where the wake from suspend functionality is not
166 * configured in a separate register and the wakeup active state is 174 * configured in a separate register and the wakeup active state is
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)
169int irq_gc_set_wake(struct irq_data *d, unsigned int on) 177int irq_gc_set_wake(struct irq_data *d, unsigned int on)
170{ 178{
171 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 179 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
172 u32 mask = 1 << (d->irq - gc->irq_base); 180 u32 mask = d->mask;
173 181
174 if (!(mask & gc->wake_enabled)) 182 if (!(mask & gc->wake_enabled))
175 return -EINVAL; 183 return -EINVAL;
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
183 return 0; 191 return 0;
184} 192}
185 193
194static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base,
197 void __iomem *reg_base, irq_flow_handler_t handler)
198{
199 raw_spin_lock_init(&gc->lock);
200 gc->num_ct = num_ct;
201 gc->irq_base = irq_base;
202 gc->reg_base = reg_base;
203 gc->chip_types->chip.name = name;
204 gc->chip_types->handler = handler;
205}
206
186/** 207/**
187 * irq_alloc_generic_chip - Allocate a generic chip and initialize it 208 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
188 * @name: Name of the irq chip 209 * @name: Name of the irq chip
@@ -203,23 +224,185 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
203 224
204 gc = kzalloc(sz, GFP_KERNEL); 225 gc = kzalloc(sz, GFP_KERNEL);
205 if (gc) { 226 if (gc) {
206 raw_spin_lock_init(&gc->lock); 227 irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
207 gc->num_ct = num_ct; 228 handler);
208 gc->irq_base = irq_base;
209 gc->reg_base = reg_base;
210 gc->chip_types->chip.name = name;
211 gc->chip_types->handler = handler;
212 } 229 }
213 return gc; 230 return gc;
214} 231}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); 232EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 233
234static void
235irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
239 int i;
240
241 for (i = 0; i < gc->num_ct; i++) {
242 if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
243 mskptr = &ct[i].mask_cache_priv;
244 mskreg = ct[i].regs.mask;
245 }
246 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg);
249 }
250}
251
252/**
253 * irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
254 * @d: irq domain for which to allocate chips
255 * @irqs_per_chip: Number of interrupts each chip handles
256 * @num_ct: Number of irq_chip_type instances associated with each chip
257 * @name: Name of the irq chip
258 * @handler: Default flow handler associated with these chips
259 * @clr: IRQ_* bits to clear in the mapping function
260 * @set: IRQ_* bits to set in the mapping function
261 * @gcflags: Generic chip specific setup flags
262 */
263int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
264 int num_ct, const char *name,
265 irq_flow_handler_t handler,
266 unsigned int clr, unsigned int set,
267 enum irq_gc_flags gcflags)
268{
269 struct irq_domain_chip_generic *dgc;
270 struct irq_chip_generic *gc;
271 int numchips, sz, i;
272 unsigned long flags;
273 void *tmp;
274
275 if (d->gc)
276 return -EBUSY;
277
278 if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR)
279 return -EINVAL;
280
281 numchips = d->revmap_data.linear.size / irqs_per_chip;
282 if (!numchips)
283 return -EINVAL;
284
285 /* Allocate a pointer, generic chip and chiptypes for each chip */
286 sz = sizeof(*dgc) + numchips * sizeof(gc);
287 sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
288
289 tmp = dgc = kzalloc(sz, GFP_KERNEL);
290 if (!dgc)
291 return -ENOMEM;
292 dgc->irqs_per_chip = irqs_per_chip;
293 dgc->num_chips = numchips;
294 dgc->irq_flags_to_set = set;
295 dgc->irq_flags_to_clear = clr;
296 dgc->gc_flags = gcflags;
297 d->gc = dgc;
298
299 /* Calc pointer to the first generic chip */
300 tmp += sizeof(*dgc) + numchips * sizeof(gc);
301 for (i = 0; i < numchips; i++) {
302 /* Store the pointer to the generic chip */
303 dgc->gc[i] = gc = tmp;
304 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
305 NULL, handler);
306 gc->domain = d;
307 raw_spin_lock_irqsave(&gc_lock, flags);
308 list_add_tail(&gc->list, &gc_list);
309 raw_spin_unlock_irqrestore(&gc_lock, flags);
310 /* Calc pointer to the next generic chip */
311 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
312 }
313 return 0;
314}
315EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
316
317/**
318 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
319 * @d: irq domain pointer
320 * @hw_irq: Hardware interrupt number
321 */
322struct irq_chip_generic *
323irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
324{
325 struct irq_domain_chip_generic *dgc = d->gc;
326 int idx;
327
328 if (!dgc)
329 return NULL;
330 idx = hw_irq / dgc->irqs_per_chip;
331 if (idx >= dgc->num_chips)
332 return NULL;
333 return dgc->gc[idx];
334}
335EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
336
217/* 337/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 338 * Separate lockdep class for interrupt chip which can nest irq_desc
219 * lock. 339 * lock.
220 */ 340 */
221static struct lock_class_key irq_nested_lock_class; 341static struct lock_class_key irq_nested_lock_class;
222 342
343/*
344 * irq_map_generic_chip - Map a generic chip for an irq domain
345 */
346static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
347 irq_hw_number_t hw_irq)
348{
349 struct irq_data *data = irq_get_irq_data(virq);
350 struct irq_domain_chip_generic *dgc = d->gc;
351 struct irq_chip_generic *gc;
352 struct irq_chip_type *ct;
353 struct irq_chip *chip;
354 unsigned long flags;
355 int idx;
356
357 if (!d->gc)
358 return -ENODEV;
359
360 idx = hw_irq / dgc->irqs_per_chip;
361 if (idx >= dgc->num_chips)
362 return -EINVAL;
363 gc = dgc->gc[idx];
364
365 idx = hw_irq % dgc->irqs_per_chip;
366
367 if (test_bit(idx, &gc->unused))
368 return -ENOTSUPP;
369
370 if (test_bit(idx, &gc->installed))
371 return -EBUSY;
372
373 ct = gc->chip_types;
374 chip = &ct->chip;
375
376 /* We only init the cache for the first mapping of a generic chip */
377 if (!gc->installed) {
378 raw_spin_lock_irqsave(&gc->lock, flags);
379 irq_gc_init_mask_cache(gc, dgc->gc_flags);
380 raw_spin_unlock_irqrestore(&gc->lock, flags);
381 }
382
383 /* Mark the interrupt as installed */
384 set_bit(idx, &gc->installed);
385
386 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
387 irq_set_lockdep_class(virq, &irq_nested_lock_class);
388
389 if (chip->irq_calc_mask)
390 chip->irq_calc_mask(data);
391 else
392 data->mask = 1 << idx;
393
394 irq_set_chip_and_handler(virq, chip, ct->handler);
395 irq_set_chip_data(virq, gc);
396 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
397 return 0;
398}
399
400struct irq_domain_ops irq_generic_chip_ops = {
401 .map = irq_map_generic_chip,
402 .xlate = irq_domain_xlate_onetwocell,
403};
404EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
405
223/** 406/**
224 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip 407 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
225 * @gc: Generic irq chip holding all data 408 * @gc: Generic irq chip holding all data
@@ -237,15 +420,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
237 unsigned int set) 420 unsigned int set)
238{ 421{
239 struct irq_chip_type *ct = gc->chip_types; 422 struct irq_chip_type *ct = gc->chip_types;
423 struct irq_chip *chip = &ct->chip;
240 unsigned int i; 424 unsigned int i;
241 425
242 raw_spin_lock(&gc_lock); 426 raw_spin_lock(&gc_lock);
243 list_add_tail(&gc->list, &gc_list); 427 list_add_tail(&gc->list, &gc_list);
244 raw_spin_unlock(&gc_lock); 428 raw_spin_unlock(&gc_lock);
245 429
246 /* Init mask cache ? */ 430 irq_gc_init_mask_cache(gc, flags);
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
249 431
250 for (i = gc->irq_base; msk; msk >>= 1, i++) { 432 for (i = gc->irq_base; msk; msk >>= 1, i++) {
251 if (!(msk & 0x01)) 433 if (!(msk & 0x01))
@@ -254,7 +436,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
254 if (flags & IRQ_GC_INIT_NESTED_LOCK) 436 if (flags & IRQ_GC_INIT_NESTED_LOCK)
255 irq_set_lockdep_class(i, &irq_nested_lock_class); 437 irq_set_lockdep_class(i, &irq_nested_lock_class);
256 438
257 irq_set_chip_and_handler(i, &ct->chip, ct->handler); 439 if (!(flags & IRQ_GC_NO_MASK)) {
440 struct irq_data *d = irq_get_irq_data(i);
441
442 if (chip->irq_calc_mask)
443 chip->irq_calc_mask(d);
444 else
445 d->mask = 1 << (i - gc->irq_base);
446 }
447 irq_set_chip_and_handler(i, chip, ct->handler);
258 irq_set_chip_data(i, gc); 448 irq_set_chip_data(i, gc);
259 irq_modify_status(i, clr, set); 449 irq_modify_status(i, clr, set);
260 } 450 }
@@ -265,7 +455,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
265/** 455/**
266 * irq_setup_alt_chip - Switch to alternative chip 456 * irq_setup_alt_chip - Switch to alternative chip
267 * @d: irq_data for this interrupt 457 * @d: irq_data for this interrupt
268 * @type Flow type to be initialized 458 * @type: Flow type to be initialized
269 * 459 *
270 * Only to be called from chip->irq_set_type() callbacks. 460 * Only to be called from chip->irq_set_type() callbacks.
271 */ 461 */
@@ -317,6 +507,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
317} 507}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip); 508EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 509
510static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
511{
512 unsigned int virq;
513
514 if (!gc->domain)
515 return irq_get_irq_data(gc->irq_base);
516
517 /*
518 * We don't know which of the irqs has been actually
519 * installed. Use the first one.
520 */
521 if (!gc->installed)
522 return NULL;
523
524 virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
525 return virq ? irq_get_irq_data(virq) : NULL;
526}
527
320#ifdef CONFIG_PM 528#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 529static int irq_gc_suspend(void)
322{ 530{
@@ -325,8 +533,12 @@ static int irq_gc_suspend(void)
325 list_for_each_entry(gc, &gc_list, list) { 533 list_for_each_entry(gc, &gc_list, list) {
326 struct irq_chip_type *ct = gc->chip_types; 534 struct irq_chip_type *ct = gc->chip_types;
327 535
328 if (ct->chip.irq_suspend) 536 if (ct->chip.irq_suspend) {
329 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); 537 struct irq_data *data = irq_gc_get_irq_data(gc);
538
539 if (data)
540 ct->chip.irq_suspend(data);
541 }
330 } 542 }
331 return 0; 543 return 0;
332} 544}
@@ -338,8 +550,12 @@ static void irq_gc_resume(void)
338 list_for_each_entry(gc, &gc_list, list) { 550 list_for_each_entry(gc, &gc_list, list) {
339 struct irq_chip_type *ct = gc->chip_types; 551 struct irq_chip_type *ct = gc->chip_types;
340 552
341 if (ct->chip.irq_resume) 553 if (ct->chip.irq_resume) {
342 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); 554 struct irq_data *data = irq_gc_get_irq_data(gc);
555
556 if (data)
557 ct->chip.irq_resume(data);
558 }
343 } 559 }
344} 560}
345#else 561#else
@@ -354,8 +570,12 @@ static void irq_gc_shutdown(void)
354 list_for_each_entry(gc, &gc_list, list) { 570 list_for_each_entry(gc, &gc_list, list) {
355 struct irq_chip_type *ct = gc->chip_types; 571 struct irq_chip_type *ct = gc->chip_types;
356 572
357 if (ct->chip.irq_pm_shutdown) 573 if (ct->chip.irq_pm_shutdown) {
358 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); 574 struct irq_data *data = irq_gc_get_irq_data(gc);
575
576 if (data)
577 ct->chip.irq_pm_shutdown(data);
578 }
359 } 579 }
360} 580}
361 581
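
A hedged sketch of how a driver would consume the new irq_alloc_domain_generic_chips()/irq_get_domain_generic_chip() helpers on a linear domain; MY_NR_IRQS, the register offsets and the mask polarity (set bit enables the line) are assumptions, not taken from this patch:

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define MY_NR_IRQS	64		/* assumed controller size */

static int my_intc_init(struct device_node *np, void __iomem *base)
{
	struct irq_chip_generic *gc;
	struct irq_domain *domain;
	int ret;

	domain = irq_domain_add_linear(np, MY_NR_IRQS,
				       &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	/* One generic chip per 32 lines, one irq_chip_type per chip */
	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "MYINTC",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE);
	if (ret)
		return ret;

	/* A real driver would configure every chip in the domain;
	 * only the one covering hwirq 0..31 is shown here. */
	gc = irq_get_domain_generic_chip(domain, 0);
	gc->reg_base = base;
	gc->chip_types[0].regs.mask = 0x04;	/* assumed offsets */
	gc->chip_types[0].regs.ack  = 0x08;
	gc->chip_types[0].chip.irq_ack    = irq_gc_ack_set_bit;
	gc->chip_types[0].chip.irq_mask   = irq_gc_mask_clr_bit;
	gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit;

	return 0;
}

The per-interrupt mask bit itself is filled in later by irq_map_generic_chip() (d->mask = 1 << idx, or chip->irq_calc_mask), so after this setup each used line only needs the usual irq_create_mapping() on the domain.
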
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0c..1ed8dff17eb9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -16,12 +16,6 @@
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24
25static LIST_HEAD(irq_domain_list); 19static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 20static DEFINE_MUTEX(irq_domain_mutex);
27 21
@@ -143,7 +137,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 137 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node. 138 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 139 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain 140 * @first_irq: first number of irq block assigned to the domain,
141 * pass zero to assign irqs on-the-fly. This will result in a
142 * linear IRQ domain so it is important to use irq_create_mapping()
143 * for each used IRQ, especially when SPARSE_IRQ is enabled.
147 * @ops: map/unmap domain callbacks 144 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer 145 * @host_data: Controller private data pointer
149 * 146 *
@@ -191,6 +188,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
191 /* A linear domain is the default */ 188 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data); 189 return irq_domain_add_linear(of_node, size, ops, host_data);
193} 190}
191EXPORT_SYMBOL_GPL(irq_domain_add_simple);
194 192
195/** 193/**
196 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 194 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
@@ -397,11 +395,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
397 while (count--) { 395 while (count--) {
398 int irq = irq_base + count; 396 int irq = irq_base + count;
399 struct irq_data *irq_data = irq_get_irq_data(irq); 397 struct irq_data *irq_data = irq_get_irq_data(irq);
400 irq_hw_number_t hwirq = irq_data->hwirq; 398 irq_hw_number_t hwirq;
401 399
402 if (WARN_ON(!irq_data || irq_data->domain != domain)) 400 if (WARN_ON(!irq_data || irq_data->domain != domain))
403 continue; 401 continue;
404 402
403 hwirq = irq_data->hwirq;
405 irq_set_status_flags(irq, IRQ_NOREQUEST); 404 irq_set_status_flags(irq, IRQ_NOREQUEST);
406 405
407 /* remove chip and handler */ 406 /* remove chip and handler */
@@ -693,7 +692,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
693 692
694 /* Set type if specified and different than the current one */ 693 /* Set type if specified and different than the current one */
695 if (type != IRQ_TYPE_NONE && 694 if (type != IRQ_TYPE_NONE &&
696 type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) 695 type != irq_get_trigger_type(virq))
697 irq_set_irq_type(virq, type); 696 irq_set_irq_type(virq, type);
698 return virq; 697 return virq;
699} 698}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa17855ca65a..514bcfd855a8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
555 return 0; 555 return 0;
556 556
557 if (irq_settings_can_request(desc)) { 557 if (irq_settings_can_request(desc)) {
558 if (desc->action) 558 if (!desc->action ||
559 if (irqflags & desc->action->flags & IRQF_SHARED) 559 irqflags & desc->action->flags & IRQF_SHARED)
560 canrequest =1; 560 canrequest = 1;
561 } 561 }
562 irq_put_desc_unlock(desc, flags); 562 irq_put_desc_unlock(desc, flags);
563 return canrequest; 563 return canrequest;
@@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused)
840static int irq_thread(void *data) 840static int irq_thread(void *data)
841{ 841{
842 struct callback_head on_exit_work; 842 struct callback_head on_exit_work;
843 static const struct sched_param param = {
844 .sched_priority = MAX_USER_RT_PRIO/2,
845 };
846 struct irqaction *action = data; 843 struct irqaction *action = data;
847 struct irq_desc *desc = irq_to_desc(action->irq); 844 struct irq_desc *desc = irq_to_desc(action->irq);
848 irqreturn_t (*handler_fn)(struct irq_desc *desc, 845 irqreturn_t (*handler_fn)(struct irq_desc *desc,
@@ -854,8 +851,6 @@ static int irq_thread(void *data)
854 else 851 else
855 handler_fn = irq_thread_fn; 852 handler_fn = irq_thread_fn;
856 853
857 sched_setscheduler(current, SCHED_FIFO, &param);
858
859 init_task_work(&on_exit_work, irq_thread_dtor); 854 init_task_work(&on_exit_work, irq_thread_dtor);
860 task_work_add(current, &on_exit_work, false); 855 task_work_add(current, &on_exit_work, false);
861 856
@@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
950 */ 945 */
951 if (new->thread_fn && !nested) { 946 if (new->thread_fn && !nested) {
952 struct task_struct *t; 947 struct task_struct *t;
948 static const struct sched_param param = {
949 .sched_priority = MAX_USER_RT_PRIO/2,
950 };
953 951
954 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 952 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
955 new->name); 953 new->name);
@@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
957 ret = PTR_ERR(t); 955 ret = PTR_ERR(t);
958 goto out_mput; 956 goto out_mput;
959 } 957 }
958
959 sched_setscheduler(t, SCHED_FIFO, &param);
960
960 /* 961 /*
961 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
962 * the thread dies to avoid that the interrupt code 963 * the thread dies to avoid that the interrupt code
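
The last two hunks move the SCHED_FIFO setup from the irq thread itself into __setup_irq(): the creator fixes the policy before the thread ever runs. In isolation the pattern looks like this (a sketch; my_thread_fn, the thread name and the priority choice are assumptions):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *start_fifo_worker(void *data)
{
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO / 2,
	};
	struct task_struct *t;

	t = kthread_create(my_thread_fn, data, "my-fifo-worker");
	if (IS_ERR(t))
		return t;

	/* Done by the creator, so the policy is already in place when
	 * the thread is first woken; the thread no longer has to call
	 * sched_setscheduler() on itself. */
	sched_setscheduler(t, SCHED_FIFO, &param);
	wake_up_process(t);
	return t;
}
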
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcdf..bddf3b201a48 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
467/* Optimization staging list, protected by kprobe_mutex */ 467/* Optimization staging list, protected by kprobe_mutex */
468static LIST_HEAD(optimizing_list); 468static LIST_HEAD(optimizing_list);
469static LIST_HEAD(unoptimizing_list); 469static LIST_HEAD(unoptimizing_list);
470static LIST_HEAD(freeing_list);
470 471
471static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
504 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 505 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
505 * if needed) kprobes listed on unoptimizing_list. 506 * if needed) kprobes listed on unoptimizing_list.
506 */ 507 */
507static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) 508static __kprobes void do_unoptimize_kprobes(void)
508{ 509{
509 struct optimized_kprobe *op, *tmp; 510 struct optimized_kprobe *op, *tmp;
510 511
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
515 /* Ditto to do_optimize_kprobes */ 516 /* Ditto to do_optimize_kprobes */
516 get_online_cpus(); 517 get_online_cpus();
517 mutex_lock(&text_mutex); 518 mutex_lock(&text_mutex);
518 arch_unoptimize_kprobes(&unoptimizing_list, free_list); 519 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
519 /* Loop free_list for disarming */ 520 /* Loop free_list for disarming */
520 list_for_each_entry_safe(op, tmp, free_list, list) { 521 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
521 /* Disarm probes if marked disabled */ 522 /* Disarm probes if marked disabled */
522 if (kprobe_disabled(&op->kp)) 523 if (kprobe_disabled(&op->kp))
523 arch_disarm_kprobe(&op->kp); 524 arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
536} 537}
537 538
538/* Reclaim all kprobes on the free_list */ 539/* Reclaim all kprobes on the free_list */
539static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) 540static __kprobes void do_free_cleaned_kprobes(void)
540{ 541{
541 struct optimized_kprobe *op, *tmp; 542 struct optimized_kprobe *op, *tmp;
542 543
543 list_for_each_entry_safe(op, tmp, free_list, list) { 544 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
544 BUG_ON(!kprobe_unused(&op->kp)); 545 BUG_ON(!kprobe_unused(&op->kp));
545 list_del_init(&op->list); 546 list_del_init(&op->list);
546 free_aggr_kprobe(&op->kp); 547 free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
556/* Kprobe jump optimizer */ 557/* Kprobe jump optimizer */
557static __kprobes void kprobe_optimizer(struct work_struct *work) 558static __kprobes void kprobe_optimizer(struct work_struct *work)
558{ 559{
559 LIST_HEAD(free_list);
560
561 mutex_lock(&kprobe_mutex); 560 mutex_lock(&kprobe_mutex);
562 /* Lock modules while optimizing kprobes */ 561 /* Lock modules while optimizing kprobes */
563 mutex_lock(&module_mutex); 562 mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
566 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 565 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
567 * kprobes before waiting for quiescence period. 566 * kprobes before waiting for quiescence period.
568 */ 567 */
569 do_unoptimize_kprobes(&free_list); 568 do_unoptimize_kprobes();
570 569
571 /* 570 /*
572 * Step 2: Wait for quiescence period to ensure all running interrupts 571 * Step 2: Wait for quiescence period to ensure all running interrupts
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
581 do_optimize_kprobes(); 580 do_optimize_kprobes();
582 581
583 /* Step 4: Free cleaned kprobes after quiescence period */ 582 /* Step 4: Free cleaned kprobes after quiescence period */
584 do_free_cleaned_kprobes(&free_list); 583 do_free_cleaned_kprobes();
585 584
586 mutex_unlock(&module_mutex); 585 mutex_unlock(&module_mutex);
587 mutex_unlock(&kprobe_mutex); 586 mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723 if (!list_empty(&op->list)) 722 if (!list_empty(&op->list))
724 /* Dequeue from the (un)optimization queue */ 723 /* Dequeue from the (un)optimization queue */
725 list_del_init(&op->list); 724 list_del_init(&op->list);
726
727 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 725 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
726
727 if (kprobe_unused(p)) {
728 /* Enqueue if it is unused */
729 list_add(&op->list, &freeing_list);
730 /*
731 * Remove unused probes from the hash list. After waiting
732 * for synchronization, this probe is reclaimed.
733 * (reclaiming is done by do_free_cleaned_kprobes().)
734 */
735 hlist_del_rcu(&op->kp.hlist);
736 }
737
728 /* Don't touch the code, because it is already freed. */ 738 /* Don't touch the code, because it is already freed. */
729 arch_remove_optimized_kprobe(op); 739 arch_remove_optimized_kprobe(op);
730} 740}
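
Stripped of the kprobe specifics, the freeing_list change above is the usual collect-then-reclaim idiom: dead objects are queued on a file-scope list under the subsystem mutex and freed only after a grace period. A generic sketch (struct garbage, reclaim_list and the synchronize_rcu() stand-in for the optimizer's quiescence wait are assumptions):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct garbage {
	struct list_head list;
	/* ... payload ... */
};

static LIST_HEAD(reclaim_list);		/* file scope, like freeing_list */

/* Caller holds the subsystem mutex (kprobe_mutex in the code above). */
static void queue_for_reclaim(struct garbage *g)
{
	list_add(&g->list, &reclaim_list);
}

static void reclaim_collected(void)
{
	struct garbage *g, *tmp;

	/* Stand-in for the optimizer's "wait for quiescence" step. */
	synchronize_rcu();

	list_for_each_entry_safe(g, tmp, &reclaim_list, list) {
		list_del_init(&g->list);
		kfree(g);
	}
}
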
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ad53a664f113..e581ada5faf4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock)
254 254
255EXPORT_SYMBOL(mutex_unlock); 255EXPORT_SYMBOL(mutex_unlock);
256 256
257/**
258 * ww_mutex_unlock - release the w/w mutex
259 * @lock: the mutex to be released
260 *
261 * Unlock a mutex that has been locked by this task previously with any of the
262 * ww_mutex_lock* functions (with or without an acquire context). It is
263 * forbidden to release the locks after releasing the acquire context.
264 *
265 * This function must not be used in interrupt context. Unlocking
266 * of an unlocked mutex is not allowed.
267 */
268void __sched ww_mutex_unlock(struct ww_mutex *lock)
269{
270 /*
271 * The unlocking fastpath is the 0->1 transition from 'locked'
272 * into 'unlocked' state:
273 */
274 if (lock->ctx) {
275#ifdef CONFIG_DEBUG_MUTEXES
276 DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
277#endif
278 if (lock->ctx->acquired > 0)
279 lock->ctx->acquired--;
280 lock->ctx = NULL;
281 }
282
283#ifndef CONFIG_DEBUG_MUTEXES
284 /*
285 * When debugging is enabled we must not clear the owner before time:
286 * the slow path will always be taken, and it clears the owner field
287 * after verifying that it was indeed current.
288 */
289 mutex_clear_owner(&lock->base);
290#endif
291 __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
292}
293EXPORT_SYMBOL(ww_mutex_unlock);
294
295static inline int __sched
296__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
297{
298 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
299 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
300
301 if (!hold_ctx)
302 return 0;
303
304 if (unlikely(ctx == hold_ctx))
305 return -EALREADY;
306
307 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
308 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
309#ifdef CONFIG_DEBUG_MUTEXES
310 DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
311 ctx->contending_lock = ww;
312#endif
313 return -EDEADLK;
314 }
315
316 return 0;
317}
318
319static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
320 struct ww_acquire_ctx *ww_ctx)
321{
322#ifdef CONFIG_DEBUG_MUTEXES
323 /*
324 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
325 * but released with a normal mutex_unlock in this call.
326 *
327 * This should never happen, always use ww_mutex_unlock.
328 */
329 DEBUG_LOCKS_WARN_ON(ww->ctx);
330
331 /*
332 * Not quite done after calling ww_acquire_done() ?
333 */
334 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
335
336 if (ww_ctx->contending_lock) {
337 /*
338 * After -EDEADLK you tried to
339 * acquire a different ww_mutex? Bad!
340 */
341 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
342
343 /*
344 * You called ww_mutex_lock after receiving -EDEADLK,
345 * but 'forgot' to unlock everything else first?
346 */
347 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
348 ww_ctx->contending_lock = NULL;
349 }
350
351 /*
352 * Naughty, using a different class will lead to undefined behavior!
353 */
354 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
355#endif
356 ww_ctx->acquired++;
357}
358
359/*
360 * after acquiring lock with fastpath or when we lost out in contested
361 * slowpath, set ctx and wake up any waiters so they can recheck.
362 *
363 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
364 * as the fastpath and opportunistic spinning are disabled in that case.
365 */
366static __always_inline void
367ww_mutex_set_context_fastpath(struct ww_mutex *lock,
368 struct ww_acquire_ctx *ctx)
369{
370 unsigned long flags;
371 struct mutex_waiter *cur;
372
373 ww_mutex_lock_acquired(lock, ctx);
374
375 lock->ctx = ctx;
376
377 /*
378 * The lock->ctx update should be visible on all cores before
379 * the atomic read is done, otherwise contended waiters might be
380 * missed. The contended waiters will either see ww_ctx == NULL
381 * and keep spinning, or they will acquire wait_lock, add themselves
382 * to the waiter list and sleep.
383 */
384 smp_mb(); /* ^^^ */
385
386 /*
387 * Check if lock is contended, if not there is nobody to wake up
388 */
389 if (likely(atomic_read(&lock->base.count) == 0))
390 return;
391
392 /*
393 * Uh oh, we raced in fastpath, wake up everyone in this case,
394 * so they can see the new lock->ctx.
395 */
396 spin_lock_mutex(&lock->base.wait_lock, flags);
397 list_for_each_entry(cur, &lock->base.wait_list, list) {
398 debug_mutex_wake_waiter(&lock->base, cur);
399 wake_up_process(cur->task);
400 }
401 spin_unlock_mutex(&lock->base.wait_lock, flags);
402}
403
257/* 404/*
258 * Lock a mutex (possibly interruptible), slowpath: 405 * Lock a mutex (possibly interruptible), slowpath:
259 */ 406 */
260static inline int __sched 407static __always_inline int __sched
261__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 408__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
262 struct lockdep_map *nest_lock, unsigned long ip) 409 struct lockdep_map *nest_lock, unsigned long ip,
410 struct ww_acquire_ctx *ww_ctx)
263{ 411{
264 struct task_struct *task = current; 412 struct task_struct *task = current;
265 struct mutex_waiter waiter; 413 struct mutex_waiter waiter;
266 unsigned long flags; 414 unsigned long flags;
415 int ret;
267 416
268 preempt_disable(); 417 preempt_disable();
269 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 418 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
298 struct task_struct *owner; 447 struct task_struct *owner;
299 struct mspin_node node; 448 struct mspin_node node;
300 449
450 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
451 struct ww_mutex *ww;
452
453 ww = container_of(lock, struct ww_mutex, base);
454 /*
455 * If ww->ctx is set the contents are undefined; only
456 * by acquiring wait_lock is there a guarantee that
457 * they are not invalid when reading.
458 *
459 * As such, when deadlock detection needs to be
460 * performed the optimistic spinning cannot be done.
461 */
462 if (ACCESS_ONCE(ww->ctx))
463 break;
464 }
465
301 /* 466 /*
302 * If there's an owner, wait for it to either 467 * If there's an owner, wait for it to either
303 * release the lock or go to sleep. 468 * release the lock or go to sleep.
@@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
312 if ((atomic_read(&lock->count) == 1) && 477 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 478 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
314 lock_acquired(&lock->dep_map, ip); 479 lock_acquired(&lock->dep_map, ip);
480 if (!__builtin_constant_p(ww_ctx == NULL)) {
481 struct ww_mutex *ww;
482 ww = container_of(lock, struct ww_mutex, base);
483
484 ww_mutex_set_context_fastpath(ww, ww_ctx);
485 }
486
315 mutex_set_owner(lock); 487 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node); 488 mspin_unlock(MLOCK(lock), &node);
317 preempt_enable(); 489 preempt_enable();
@@ -371,15 +543,16 @@ slowpath:
371 * TASK_UNINTERRUPTIBLE case.) 543 * TASK_UNINTERRUPTIBLE case.)
372 */ 544 */
373 if (unlikely(signal_pending_state(state, task))) { 545 if (unlikely(signal_pending_state(state, task))) {
374 mutex_remove_waiter(lock, &waiter, 546 ret = -EINTR;
375 task_thread_info(task)); 547 goto err;
376 mutex_release(&lock->dep_map, 1, ip); 548 }
377 spin_unlock_mutex(&lock->wait_lock, flags);
378 549
379 debug_mutex_free_waiter(&waiter); 550 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
380 preempt_enable(); 551 ret = __mutex_lock_check_stamp(lock, ww_ctx);
381 return -EINTR; 552 if (ret)
553 goto err;
382 } 554 }
555
383 __set_task_state(task, state); 556 __set_task_state(task, state);
384 557
385 /* didn't get the lock, go to sleep: */ 558 /* didn't get the lock, go to sleep: */
@@ -394,6 +567,30 @@ done:
394 mutex_remove_waiter(lock, &waiter, current_thread_info()); 567 mutex_remove_waiter(lock, &waiter, current_thread_info());
395 mutex_set_owner(lock); 568 mutex_set_owner(lock);
396 569
570 if (!__builtin_constant_p(ww_ctx == NULL)) {
571 struct ww_mutex *ww = container_of(lock,
572 struct ww_mutex,
573 base);
574 struct mutex_waiter *cur;
575
576 /*
577 * This branch gets optimized out for the common case,
578 * and is only important for ww_mutex_lock.
579 */
580
581 ww_mutex_lock_acquired(ww, ww_ctx);
582 ww->ctx = ww_ctx;
583
584 /*
585 * Give any possible sleeping processes the chance to wake up,
586 * so they can recheck if they have to back off.
587 */
588 list_for_each_entry(cur, &lock->wait_list, list) {
589 debug_mutex_wake_waiter(lock, cur);
590 wake_up_process(cur->task);
591 }
592 }
593
397 /* set it to 0 if there are no waiters left: */ 594 /* set it to 0 if there are no waiters left: */
398 if (likely(list_empty(&lock->wait_list))) 595 if (likely(list_empty(&lock->wait_list)))
399 atomic_set(&lock->count, 0); 596 atomic_set(&lock->count, 0);
@@ -404,6 +601,14 @@ done:
404 preempt_enable(); 601 preempt_enable();
405 602
406 return 0; 603 return 0;
604
605err:
606 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
607 spin_unlock_mutex(&lock->wait_lock, flags);
608 debug_mutex_free_waiter(&waiter);
609 mutex_release(&lock->dep_map, 1, ip);
610 preempt_enable();
611 return ret;
407} 612}
408 613
409#ifdef CONFIG_DEBUG_LOCK_ALLOC 614#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -411,7 +616,8 @@ void __sched
411mutex_lock_nested(struct mutex *lock, unsigned int subclass) 616mutex_lock_nested(struct mutex *lock, unsigned int subclass)
412{ 617{
413 might_sleep(); 618 might_sleep();
414 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); 619 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
620 subclass, NULL, _RET_IP_, NULL);
415} 621}
416 622
417EXPORT_SYMBOL_GPL(mutex_lock_nested); 623EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -420,7 +626,8 @@ void __sched
420_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) 626_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
421{ 627{
422 might_sleep(); 628 might_sleep();
423 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); 629 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
630 0, nest, _RET_IP_, NULL);
424} 631}
425 632
426EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 633EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -429,7 +636,8 @@ int __sched
429mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 636mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
430{ 637{
431 might_sleep(); 638 might_sleep();
432 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); 639 return __mutex_lock_common(lock, TASK_KILLABLE,
640 subclass, NULL, _RET_IP_, NULL);
433} 641}
434EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 642EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
435 643
@@ -438,10 +646,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
438{ 646{
439 might_sleep(); 647 might_sleep();
440 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 648 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
441 subclass, NULL, _RET_IP_); 649 subclass, NULL, _RET_IP_, NULL);
442} 650}
443 651
444EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 652EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
653
654static inline int
655ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
656{
657#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
658 unsigned tmp;
659
660 if (ctx->deadlock_inject_countdown-- == 0) {
661 tmp = ctx->deadlock_inject_interval;
662 if (tmp > UINT_MAX/4)
663 tmp = UINT_MAX;
664 else
665 tmp = tmp*2 + tmp + tmp/2;
666
667 ctx->deadlock_inject_interval = tmp;
668 ctx->deadlock_inject_countdown = tmp;
669 ctx->contending_lock = lock;
670
671 ww_mutex_unlock(lock);
672
673 return -EDEADLK;
674 }
675#endif
676
677 return 0;
678}
679
680int __sched
681__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
682{
683 int ret;
684
685 might_sleep();
686 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
687 0, &ctx->dep_map, _RET_IP_, ctx);
688 if (!ret && ctx->acquired > 0)
689 return ww_mutex_deadlock_injection(lock, ctx);
690
691 return ret;
692}
693EXPORT_SYMBOL_GPL(__ww_mutex_lock);
694
695int __sched
696__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
697{
698 int ret;
699
700 might_sleep();
701 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
702 0, &ctx->dep_map, _RET_IP_, ctx);
703
704 if (!ret && ctx->acquired > 0)
705 return ww_mutex_deadlock_injection(lock, ctx);
706
707 return ret;
708}
709EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
710
445#endif 711#endif
446 712
447/* 713/*
@@ -494,10 +760,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
494 * mutex_lock_interruptible() and mutex_trylock(). 760 * mutex_lock_interruptible() and mutex_trylock().
495 */ 761 */
496static noinline int __sched 762static noinline int __sched
497__mutex_lock_killable_slowpath(atomic_t *lock_count); 763__mutex_lock_killable_slowpath(struct mutex *lock);
498 764
499static noinline int __sched 765static noinline int __sched
500__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 766__mutex_lock_interruptible_slowpath(struct mutex *lock);
501 767
502/** 768/**
503 * mutex_lock_interruptible - acquire the mutex, interruptible 769 * mutex_lock_interruptible - acquire the mutex, interruptible
@@ -515,12 +781,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
515 int ret; 781 int ret;
516 782
517 might_sleep(); 783 might_sleep();
518 ret = __mutex_fastpath_lock_retval 784 ret = __mutex_fastpath_lock_retval(&lock->count);
519 (&lock->count, __mutex_lock_interruptible_slowpath); 785 if (likely(!ret)) {
520 if (!ret)
521 mutex_set_owner(lock); 786 mutex_set_owner(lock);
522 787 return 0;
523 return ret; 788 } else
789 return __mutex_lock_interruptible_slowpath(lock);
524} 790}
525 791
526EXPORT_SYMBOL(mutex_lock_interruptible); 792EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -530,12 +796,12 @@ int __sched mutex_lock_killable(struct mutex *lock)
530 int ret; 796 int ret;
531 797
532 might_sleep(); 798 might_sleep();
533 ret = __mutex_fastpath_lock_retval 799 ret = __mutex_fastpath_lock_retval(&lock->count);
534 (&lock->count, __mutex_lock_killable_slowpath); 800 if (likely(!ret)) {
535 if (!ret)
536 mutex_set_owner(lock); 801 mutex_set_owner(lock);
537 802 return 0;
538 return ret; 803 } else
804 return __mutex_lock_killable_slowpath(lock);
539} 805}
540EXPORT_SYMBOL(mutex_lock_killable); 806EXPORT_SYMBOL(mutex_lock_killable);
541 807
@@ -544,24 +810,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
544{ 810{
545 struct mutex *lock = container_of(lock_count, struct mutex, count); 811 struct mutex *lock = container_of(lock_count, struct mutex, count);
546 812
547 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); 813 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
814 NULL, _RET_IP_, NULL);
548} 815}
549 816
550static noinline int __sched 817static noinline int __sched
551__mutex_lock_killable_slowpath(atomic_t *lock_count) 818__mutex_lock_killable_slowpath(struct mutex *lock)
552{ 819{
553 struct mutex *lock = container_of(lock_count, struct mutex, count); 820 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
821 NULL, _RET_IP_, NULL);
822}
554 823
555 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); 824static noinline int __sched
825__mutex_lock_interruptible_slowpath(struct mutex *lock)
826{
827 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
828 NULL, _RET_IP_, NULL);
556} 829}
557 830
558static noinline int __sched 831static noinline int __sched
559__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 832__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
560{ 833{
561 struct mutex *lock = container_of(lock_count, struct mutex, count); 834 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
835 NULL, _RET_IP_, ctx);
836}
562 837
563 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); 838static noinline int __sched
839__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
840 struct ww_acquire_ctx *ctx)
841{
842 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
843 NULL, _RET_IP_, ctx);
564} 844}
845
565#endif 846#endif
566 847
567/* 848/*
@@ -617,6 +898,45 @@ int __sched mutex_trylock(struct mutex *lock)
617} 898}
618EXPORT_SYMBOL(mutex_trylock); 899EXPORT_SYMBOL(mutex_trylock);
619 900
901#ifndef CONFIG_DEBUG_LOCK_ALLOC
902int __sched
903__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
904{
905 int ret;
906
907 might_sleep();
908
909 ret = __mutex_fastpath_lock_retval(&lock->base.count);
910
911 if (likely(!ret)) {
912 ww_mutex_set_context_fastpath(lock, ctx);
913 mutex_set_owner(&lock->base);
914 } else
915 ret = __ww_mutex_lock_slowpath(lock, ctx);
916 return ret;
917}
918EXPORT_SYMBOL(__ww_mutex_lock);
919
920int __sched
921__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
922{
923 int ret;
924
925 might_sleep();
926
927 ret = __mutex_fastpath_lock_retval(&lock->base.count);
928
929 if (likely(!ret)) {
930 ww_mutex_set_context_fastpath(lock, ctx);
931 mutex_set_owner(&lock->base);
932 } else
933 ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
934 return ret;
935}
936EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
937
938#endif
939
620/** 940/**
621 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 941 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
622 * @cnt: the atomic which we are to dec 942 * @cnt: the atomic which we are to dec
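
From the caller's side the __ww_mutex_lock*() entry points above sit behind the ww_mutex_lock()/ww_mutex_lock_slow() wrappers in the accompanying header. A hedged two-lock usage sketch follows; struct obj, my_ww_class and the a-then-b ordering are assumptions, and the CONFIG_DEBUG_WW_MUTEX_SLOWPATH deadlock injection is ignored for brevity. The key point is that -EDEADLK means: drop everything held, sleep on the contended lock with ww_mutex_lock_slow(), then retry.

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(my_ww_class);

struct obj {
	struct ww_mutex lock;
	/* ... */
};

static void lock_pair(struct obj *a, struct obj *b)
{
	struct ww_acquire_ctx ctx;
	int ret;

	ww_acquire_init(&ctx, &my_ww_class);

	ww_mutex_lock(&a->lock, &ctx);	/* first lock: no back-off needed */

	ret = ww_mutex_lock(&b->lock, &ctx);
	while (ret == -EDEADLK) {
		/* We hold only a; an older context owns b.  Back off,
		 * wait for b without the wound/wait check, retake a. */
		ww_mutex_unlock(&a->lock);
		ww_mutex_lock_slow(&b->lock, &ctx);
		ret = ww_mutex_lock(&a->lock, &ctx);
		if (ret == -EDEADLK) {
			/* Same dance with the roles reversed. */
			ww_mutex_unlock(&b->lock);
			ww_mutex_lock_slow(&a->lock, &ctx);
			ret = ww_mutex_lock(&b->lock, &ctx);
		}
	}
	ww_acquire_done(&ctx);

	/* ... both objects are now safely held ... */

	ww_mutex_unlock(&a->lock);
	ww_mutex_unlock(&b->lock);
	ww_acquire_fini(&ctx);
}
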
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180b..d444c4e834f4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,7 +100,6 @@ config PM_SLEEP_SMP
100 depends on SMP 100 depends on SMP
101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
102 depends on PM_SLEEP 102 depends on PM_SLEEP
103 select HOTPLUG
104 select HOTPLUG_CPU 103 select HOTPLUG_CPU
105 104
106config PM_AUTOSLEEP 105config PM_AUTOSLEEP
@@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS
263 bool 262 bool
264 depends on PM 263 depends on PM
265 264
265config WQ_POWER_EFFICIENT_DEFAULT
266 bool "Enable workqueue power-efficient mode by default"
267 depends on PM
268 default n
269 help
270 Per-cpu workqueues are generally preferred because they show
271 better performance thanks to cache locality; unfortunately,
272 per-cpu workqueues tend to be more power hungry than unbound
273 workqueues.
274
275 Enabling the workqueue.power_efficient kernel parameter makes the
276 per-cpu workqueues which were observed to contribute
277 significantly to power consumption unbound, leading to measurably
278 lower power usage at the cost of a small performance overhead.
279
280 This config option determines whether workqueue.power_efficient
281 is enabled by default.
282
283 If in doubt, say N.
284
266config PM_GENERIC_DOMAINS_SLEEP 285config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y 286 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS 287 depends on PM_SLEEP && PM_GENERIC_DOMAINS
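
On the driver side the workqueues this option affects are the ones created with the WQ_POWER_EFFICIENT flag introduced alongside it; a minimal sketch (the queue name and error handling are illustrative):

#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static int __init my_wq_init(void)
{
	/* Per-cpu for cache locality by default; promoted to WQ_UNBOUND
	 * when workqueue.power_efficient (or this Kconfig default)
	 * enables power-efficient mode. */
	my_wq = alloc_workqueue("my_driver_wq", WQ_POWER_EFFICIENT, 0);
	if (!my_wq)
		return -ENOMEM;
	return 0;
}
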
diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e1494420..8212c1aef125 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
363 log_next_seq++; 363 log_next_seq++;
364} 364}
365 365
366#ifdef CONFIG_SECURITY_DMESG_RESTRICT
367int dmesg_restrict = 1;
368#else
369int dmesg_restrict;
370#endif
371
372static int syslog_action_restricted(int type)
373{
374 if (dmesg_restrict)
375 return 1;
376 /*
377 * Unless restricted, we allow "read all" and "get buffer size"
378 * for everybody.
379 */
380 return type != SYSLOG_ACTION_READ_ALL &&
381 type != SYSLOG_ACTION_SIZE_BUFFER;
382}
383
384static int check_syslog_permissions(int type, bool from_file)
385{
386 /*
387 * If this is from /proc/kmsg and we've already opened it, then we've
388 * already done the capabilities checks at open time.
389 */
390 if (from_file && type != SYSLOG_ACTION_OPEN)
391 return 0;
392
393 if (syslog_action_restricted(type)) {
394 if (capable(CAP_SYSLOG))
395 return 0;
396 /*
397 * For historical reasons, accept CAP_SYS_ADMIN too, with
398 * a warning.
399 */
400 if (capable(CAP_SYS_ADMIN)) {
401 pr_warn_once("%s (%d): Attempt to access syslog with "
402 "CAP_SYS_ADMIN but no CAP_SYSLOG "
403 "(deprecated).\n",
404 current->comm, task_pid_nr(current));
405 return 0;
406 }
407 return -EPERM;
408 }
409 return security_syslog(type);
410}
411
412
366/* /dev/kmsg - userspace message inject/listen interface */ 413/* /dev/kmsg - userspace message inject/listen interface */
367struct devkmsg_user { 414struct devkmsg_user {
368 u64 seq; 415 u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
620 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 667 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
621 return 0; 668 return 0;
622 669
623 err = security_syslog(SYSLOG_ACTION_READ_ALL); 670 err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
671 SYSLOG_FROM_READER);
624 if (err) 672 if (err)
625 return err; 673 return err;
626 674
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
813} 861}
814#endif 862#endif
815 863
816#ifdef CONFIG_SECURITY_DMESG_RESTRICT
817int dmesg_restrict = 1;
818#else
819int dmesg_restrict;
820#endif
821
822static int syslog_action_restricted(int type)
823{
824 if (dmesg_restrict)
825 return 1;
826 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
827 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
828}
829
830static int check_syslog_permissions(int type, bool from_file)
831{
832 /*
833 * If this is from /proc/kmsg and we've already opened it, then we've
834 * already done the capabilities checks at open time.
835 */
836 if (from_file && type != SYSLOG_ACTION_OPEN)
837 return 0;
838
839 if (syslog_action_restricted(type)) {
840 if (capable(CAP_SYSLOG))
841 return 0;
842 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
843 if (capable(CAP_SYS_ADMIN)) {
844 printk_once(KERN_WARNING "%s (%d): "
845 "Attempt to access syslog with CAP_SYS_ADMIN "
846 "but no CAP_SYSLOG (deprecated).\n",
847 current->comm, task_pid_nr(current));
848 return 0;
849 }
850 return -EPERM;
851 }
852 return 0;
853}
854
855#if defined(CONFIG_PRINTK_TIME) 864#if defined(CONFIG_PRINTK_TIME)
856static bool printk_time = 1; 865static bool printk_time = 1;
857#else 866#else
@@ -1249,7 +1258,7 @@ out:
1249 1258
1250SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 1259SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1251{ 1260{
1252 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1261 return do_syslog(type, buf, len, SYSLOG_FROM_READER);
1253} 1262}
1254 1263
1255/* 1264/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a3f69c..335a7ae697f5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,
665 if (unlikely(is_compat_task())) { 665 if (unlikely(is_compat_task())) {
666 compat_siginfo_t __user *uinfo = compat_ptr(data); 666 compat_siginfo_t __user *uinfo = compat_ptr(data);
667 667
668 ret = copy_siginfo_to_user32(uinfo, &info); 668 if (copy_siginfo_to_user32(uinfo, &info) ||
669 ret |= __put_user(info.si_code, &uinfo->si_code); 669 __put_user(info.si_code, &uinfo->si_code)) {
670 ret = -EFAULT;
671 break;
672 }
673
670 } else 674 } else
671#endif 675#endif
672 { 676 {
673 siginfo_t __user *uinfo = (siginfo_t __user *) data; 677 siginfo_t __user *uinfo = (siginfo_t __user *) data;
674 678
675 ret = copy_siginfo_to_user(uinfo, &info); 679 if (copy_siginfo_to_user(uinfo, &info) ||
676 ret |= __put_user(info.si_code, &uinfo->si_code); 680 __put_user(info.si_code, &uinfo->si_code)) {
677 } 681 ret = -EFAULT;
678 682 break;
679 if (ret) { 683 }
680 ret = -EFAULT;
681 break;
682 } 684 }
683 685
684 data += sizeof(siginfo_t); 686 data += sizeof(siginfo_t);
diff --git a/kernel/range.c b/kernel/range.c
index eb911dbce267..322ea8e93e4b 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7#include <linux/string.h>
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
32 if (start >= end) 32 if (start >= end)
33 return nr_range; 33 return nr_range;
34 34
35 /* Try to merge it with old one: */ 35 /* get new start/end: */
36 for (i = 0; i < nr_range; i++) { 36 for (i = 0; i < nr_range; i++) {
37 u64 final_start, final_end;
38 u64 common_start, common_end; 37 u64 common_start, common_end;
39 38
40 if (!range[i].end) 39 if (!range[i].end)
@@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
45 if (common_start > common_end) 44 if (common_start > common_end)
46 continue; 45 continue;
47 46
48 final_start = min(range[i].start, start); 47 /* new start/end, will add it back at last */
49 final_end = max(range[i].end, end); 48 start = min(range[i].start, start);
49 end = max(range[i].end, end);
50 50
51 /* clear it and add it back for further merge */ 51 memmove(&range[i], &range[i + 1],
52 range[i].start = 0; 52 (nr_range - (i + 1)) * sizeof(range[i]));
53 range[i].end = 0; 53 range[nr_range - 1].start = 0;
54 return add_range_with_merge(range, az, nr_range, 54 range[nr_range - 1].end = 0;
55 final_start, final_end); 55 nr_range--;
56 i--;
56 } 57 }
57 58
58 /* Need to add it: */ 59 /* Need to add it: */
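The hunk above replaces the recursive merge with a single in-place pass: each overlapping entry is folded into the incoming start/end, the old slot is closed up with memmove(), and the loop re-checks the slot that just slid down. A standalone sketch of that approach follows (userspace types, capacity check omitted, function name illustrative):

#include <stdint.h>
#include <string.h>

struct range { uint64_t start, end; };

static int merge_range(struct range *r, int nr, uint64_t start, uint64_t end)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!r[i].end)
			continue;			/* empty slot */
		if (start > r[i].end || end < r[i].start)
			continue;			/* disjoint */
		/* Fold the existing entry into the incoming range ... */
		start = start < r[i].start ? start : r[i].start;
		end = end > r[i].end ? end : r[i].end;
		/* ... remove it, and re-examine the slot that slid down. */
		memmove(&r[i], &r[i + 1], (nr - (i + 1)) * sizeof(r[i]));
		nr--;
		r[nr].start = 0;
		r[nr].end = 0;
		i--;
	}
	r[nr].start = start;			/* append the merged result */
	r[nr].end = end;
	return nr + 1;
}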
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48ab70384a4c..cce6ba8bbace 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void)
104} 104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock); 105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
106 106
107/* 107#endif /* #ifdef CONFIG_PREEMPT_RCU */
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 108
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 109#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 110static struct lock_class_key rcu_lock_key;
@@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key;
145struct lockdep_map rcu_sched_lock_map = 121struct lockdep_map rcu_sched_lock_map =
146 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
147EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 123EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
148#endif
149
150#ifdef CONFIG_DEBUG_LOCK_ALLOC
151 124
152int debug_lockdep_rcu_enabled(void) 125int debug_lockdep_rcu_enabled(void)
153{ 126{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index a0714a51b6d7..aa344111de3e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,7 +44,6 @@
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 48static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
205 */ 204 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 205static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 206{
208 reset_cpu_stall_ticks(rcp); 207 RCU_TRACE(reset_cpu_stall_ticks(rcp));
209 if (rcp->rcucblist != NULL && 208 if (rcp->rcucblist != NULL &&
210 rcp->donetail != rcp->curtail) { 209 rcp->donetail != rcp->curtail) {
211 rcp->donetail = rcp->curtail; 210 rcp->donetail = rcp->curtail;
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)
227 local_irq_save(flags); 226 local_irq_save(flags);
228 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
229 rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 rcu_qsctr_help(&rcu_bh_ctrlblk))
230 invoke_rcu_callbacks(); 229 raise_softirq(RCU_SOFTIRQ);
231 local_irq_restore(flags); 230 local_irq_restore(flags);
232} 231}
233 232
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)
240 239
241 local_irq_save(flags); 240 local_irq_save(flags);
242 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 241 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
243 invoke_rcu_callbacks(); 242 raise_softirq(RCU_SOFTIRQ);
244 local_irq_restore(flags); 243 local_irq_restore(flags);
245} 244}
246 245
@@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu)
252 */ 251 */
253void rcu_check_callbacks(int cpu, int user) 252void rcu_check_callbacks(int cpu, int user)
254{ 253{
255 check_cpu_stalls(); 254 RCU_TRACE(check_cpu_stalls());
256 if (user || rcu_is_cpu_rrupt_from_idle()) 255 if (user || rcu_is_cpu_rrupt_from_idle())
257 rcu_sched_qs(cpu); 256 rcu_sched_qs(cpu);
258 else if (!in_softirq()) 257 else if (!in_softirq())
259 rcu_bh_qs(cpu); 258 rcu_bh_qs(cpu);
260 rcu_preempt_check_callbacks();
261} 259}
262 260
263/* 261/*
@@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
278 ACCESS_ONCE(rcp->rcucblist), 276 ACCESS_ONCE(rcp->rcucblist),
279 need_resched(), 277 need_resched(),
280 is_idle_task(current), 278 is_idle_task(current),
281 rcu_is_callbacks_kthread())); 279 false));
282 return; 280 return;
283 } 281 }
284 282
@@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
290 *rcp->donetail = NULL; 288 *rcp->donetail = NULL;
291 if (rcp->curtail == rcp->donetail) 289 if (rcp->curtail == rcp->donetail)
292 rcp->curtail = &rcp->rcucblist; 290 rcp->curtail = &rcp->rcucblist;
293 rcu_preempt_remove_callbacks(rcp);
294 rcp->donetail = &rcp->rcucblist; 291 rcp->donetail = &rcp->rcucblist;
295 local_irq_restore(flags); 292 local_irq_restore(flags);
296 293
@@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
309 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
310 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
311 is_idle_task(current), 308 is_idle_task(current),
312 rcu_is_callbacks_kthread())); 309 false));
313} 310}
314 311
315static void rcu_process_callbacks(struct softirq_action *unused) 312static void rcu_process_callbacks(struct softirq_action *unused)
316{ 313{
317 __rcu_process_callbacks(&rcu_sched_ctrlblk); 314 __rcu_process_callbacks(&rcu_sched_ctrlblk);
318 __rcu_process_callbacks(&rcu_bh_ctrlblk); 315 __rcu_process_callbacks(&rcu_bh_ctrlblk);
319 rcu_preempt_process_callbacks();
320} 316}
321 317
322/* 318/*
@@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
382 __call_rcu(head, func, &rcu_bh_ctrlblk); 378 __call_rcu(head, func, &rcu_bh_ctrlblk);
383} 379}
384EXPORT_SYMBOL_GPL(call_rcu_bh); 380EXPORT_SYMBOL_GPL(call_rcu_bh);
381
382void rcu_init(void)
383{
384 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
385}
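The hunks above wrap the stall bookkeeping in RCU_TRACE() so it disappears from !CONFIG_RCU_TRACE builds, while the quiescent-state path now always raises RCU_SOFTIRQ. A simplified, standalone rendering of that compile-away idiom (the macro body here is a reimplementation for illustration, not a quote of the kernel header):

#include <stdio.h>

#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt)	stmt		/* statement is compiled in */
#else
#define RCU_TRACE(stmt)			/* statement vanishes entirely */
#endif

int main(void)
{
	RCU_TRACE(printf("tracing build: stall bookkeeping runs\n"));
	printf("quiescent-state handling runs either way\n");
	return 0;
}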
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 8a233002faeb..0cd385acccfa 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#ifdef CONFIG_DEBUG_LOCK_ALLOC
56#include <linux/kernel_stat.h>
57
56int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
57EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
105#ifdef CONFIG_TINY_PREEMPT_RCU
106
107#include <linux/delay.h>
108
109/* Global control variables for preemptible RCU. */
110struct rcu_preempt_ctrlblk {
111 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
112 struct rcu_head **nexttail;
113 /* Tasks blocked in a preemptible RCU */
114 /* read-side critical section while an */
115 /* preemptible-RCU grace period is in */
116 /* progress must wait for a later grace */
117 /* period. This pointer points to the */
118 /* ->next pointer of the last task that */
119 /* must wait for a later grace period, or */
120 /* to &->rcb.rcucblist if there is no */
121 /* such task. */
122 struct list_head blkd_tasks;
123 /* Tasks blocked in RCU read-side critical */
124 /* section. Tasks are placed at the head */
125 /* of this list and age towards the tail. */
126 struct list_head *gp_tasks;
127 /* Pointer to the first task blocking the */
128 /* current grace period, or NULL if there */
129 /* is no such task. */
130 struct list_head *exp_tasks;
131 /* Pointer to first task blocking the */
132 /* current expedited grace period, or NULL */
133 /* if there is no such task. If there */
134 /* is no current expedited grace period, */
135 /* then there cannot be any such task. */
136#ifdef CONFIG_RCU_BOOST
137 struct list_head *boost_tasks;
138 /* Pointer to first task that needs to be */
139 /* priority-boosted, or NULL if no priority */
140 /* boosting is needed. If there is no */
141 /* current or expedited grace period, there */
142 /* can be no such task. */
143#endif /* #ifdef CONFIG_RCU_BOOST */
144 u8 gpnum; /* Current grace period. */
145 u8 gpcpu; /* Last grace period blocked by the CPU. */
146 u8 completed; /* Last grace period completed. */
147 /* If all three are equal, RCU is idle. */
148#ifdef CONFIG_RCU_BOOST
149 unsigned long boost_time; /* When to start boosting (jiffies) */
150#endif /* #ifdef CONFIG_RCU_BOOST */
151#ifdef CONFIG_RCU_TRACE
152 unsigned long n_grace_periods;
153#ifdef CONFIG_RCU_BOOST
154 unsigned long n_tasks_boosted;
155 /* Total number of tasks boosted. */
156 unsigned long n_exp_boosts;
157 /* Number of tasks boosted for expedited GP. */
158 unsigned long n_normal_boosts;
159 /* Number of tasks boosted for normal GP. */
160 unsigned long n_balk_blkd_tasks;
161 /* Refused to boost: no blocked tasks. */
162 unsigned long n_balk_exp_gp_tasks;
163 /* Refused to boost: nothing blocking GP. */
164 unsigned long n_balk_boost_tasks;
165 /* Refused to boost: already boosting. */
166 unsigned long n_balk_notyet;
167 /* Refused to boost: not yet time. */
168 unsigned long n_balk_nos;
169 /* Refused to boost: not sure why, though. */
170 /* This can happen due to race conditions. */
171#endif /* #ifdef CONFIG_RCU_BOOST */
172#endif /* #ifdef CONFIG_RCU_TRACE */
173};
174
175static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
176 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
177 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
178 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
179 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
180 RCU_TRACE(.rcb.name = "rcu_preempt")
181};
182
183static int rcu_preempted_readers_exp(void);
184static void rcu_report_exp_done(void);
185
186/*
187 * Return true if the CPU has not yet responded to the current grace period.
188 */
189static int rcu_cpu_blocking_cur_gp(void)
190{
191 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
192}
193
194/*
195 * Check for a running RCU reader. Because there is only one CPU,
196 * there can be but one running RCU reader at a time. ;-)
197 *
198 * Returns zero if there are no running readers. Returns a positive
199 * number if there is at least one reader within its RCU read-side
200 * critical section. Returns a negative number if an outermost reader
201 * is in the midst of exiting from its RCU read-side critical section.
207 */
208static int rcu_preempt_running_reader(void)
209{
210 return current->rcu_read_lock_nesting;
211}
212
213/*
214 * Check for preempted RCU readers blocking any grace period.
215 * If the caller needs a reliable answer, it must disable hard irqs.
216 */
217static int rcu_preempt_blocked_readers_any(void)
218{
219 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
220}
221
222/*
223 * Check for preempted RCU readers blocking the current grace period.
224 * If the caller needs a reliable answer, it must disable hard irqs.
225 */
226static int rcu_preempt_blocked_readers_cgp(void)
227{
228 return rcu_preempt_ctrlblk.gp_tasks != NULL;
229}
230
231/*
232 * Return true if another preemptible-RCU grace period is needed.
233 */
234static int rcu_preempt_needs_another_gp(void)
235{
236 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
237}
238
239/*
240 * Return true if a preemptible-RCU grace period is in progress.
241 * The caller must disable hardirqs.
242 */
243static int rcu_preempt_gp_in_progress(void)
244{
245 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
246}
247
248/*
249 * Advance a ->blkd_tasks-list pointer to the next entry, returning
250 * NULL instead if at the end of the list.
251 */
252static struct list_head *rcu_next_node_entry(struct task_struct *t)
253{
254 struct list_head *np;
255
256 np = t->rcu_node_entry.next;
257 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
258 np = NULL;
259 return np;
260}
261
262#ifdef CONFIG_RCU_TRACE
263
264#ifdef CONFIG_RCU_BOOST
265static void rcu_initiate_boost_trace(void);
266#endif /* #ifdef CONFIG_RCU_BOOST */
267
268/*
269 * Dump additional statistics for TINY_PREEMPT_RCU.
270 */
271static void show_tiny_preempt_stats(struct seq_file *m)
272{
273 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
274 rcu_preempt_ctrlblk.rcb.qlen,
275 rcu_preempt_ctrlblk.n_grace_periods,
276 rcu_preempt_ctrlblk.gpnum,
277 rcu_preempt_ctrlblk.gpcpu,
278 rcu_preempt_ctrlblk.completed,
279 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
280 "N."[!rcu_preempt_ctrlblk.gp_tasks],
281 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
282#ifdef CONFIG_RCU_BOOST
283 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
284 " ",
285 "B."[!rcu_preempt_ctrlblk.boost_tasks],
286 rcu_preempt_ctrlblk.n_tasks_boosted,
287 rcu_preempt_ctrlblk.n_exp_boosts,
288 rcu_preempt_ctrlblk.n_normal_boosts,
289 (int)(jiffies & 0xffff),
290 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
291 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
292 " balk",
293 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
294 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
295 rcu_preempt_ctrlblk.n_balk_boost_tasks,
296 rcu_preempt_ctrlblk.n_balk_notyet,
297 rcu_preempt_ctrlblk.n_balk_nos);
298#endif /* #ifdef CONFIG_RCU_BOOST */
299}
300
301#endif /* #ifdef CONFIG_RCU_TRACE */
302
303#ifdef CONFIG_RCU_BOOST
304
305#include "rtmutex_common.h"
306
307#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
308
309/* Controls for rcu_kthread() kthread. */
310static struct task_struct *rcu_kthread_task;
311static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
312static unsigned long have_rcu_kthread_work;
313
314/*
315 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
316 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
317 */
318static int rcu_boost(void)
319{
320 unsigned long flags;
321 struct rt_mutex mtx;
322 struct task_struct *t;
323 struct list_head *tb;
324
325 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL)
327 return 0; /* Nothing to boost. */
328
329 local_irq_save(flags);
330
331 /*
332 * Recheck with irqs disabled: all tasks in need of boosting
333 * might exit their RCU read-side critical sections on their own
334 * if we are preempted just before disabling irqs.
335 */
336 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
337 rcu_preempt_ctrlblk.exp_tasks == NULL) {
338 local_irq_restore(flags);
339 return 0;
340 }
341
342 /*
343 * Preferentially boost tasks blocking expedited grace periods.
344 * This cannot starve the normal grace periods because a second
345 * expedited grace period must boost all blocked tasks, including
346 * those blocking the pre-existing normal grace period.
347 */
348 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
349 tb = rcu_preempt_ctrlblk.exp_tasks;
350 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
351 } else {
352 tb = rcu_preempt_ctrlblk.boost_tasks;
353 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
354 }
355 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
356
357 /*
358 * We boost task t by manufacturing an rt_mutex that appears to
359 * be held by task t. We leave a pointer to that rt_mutex where
360 * task t can find it, and task t will release the mutex when it
361 * exits its outermost RCU read-side critical section. Then
362 * simply acquiring this artificial rt_mutex will boost task
363 * t's priority. (Thanks to tglx for suggesting this approach!)
364 */
365 t = container_of(tb, struct task_struct, rcu_node_entry);
366 rt_mutex_init_proxy_locked(&mtx, t);
367 t->rcu_boost_mutex = &mtx;
368 local_irq_restore(flags);
369 rt_mutex_lock(&mtx);
370 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
371
372 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
373 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
374}
375
376/*
377 * Check to see if it is now time to start boosting RCU readers blocking
378 * the current grace period, and, if so, tell the rcu_kthread_task to
379 * start boosting them. If there is an expedited boost in progress,
380 * we wait for it to complete.
381 *
382 * If there are no blocked readers blocking the current grace period,
383 * return 0 to let the caller know, otherwise return 1. Note that this
384 * return value is independent of whether or not boosting was done.
385 */
386static int rcu_initiate_boost(void)
387{
388 if (!rcu_preempt_blocked_readers_cgp() &&
389 rcu_preempt_ctrlblk.exp_tasks == NULL) {
390 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
391 return 0;
392 }
393 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
394 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
395 rcu_preempt_ctrlblk.boost_tasks == NULL &&
396 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
397 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
398 rcu_preempt_ctrlblk.boost_tasks =
399 rcu_preempt_ctrlblk.gp_tasks;
400 invoke_rcu_callbacks();
401 } else {
402 RCU_TRACE(rcu_initiate_boost_trace());
403 }
404 return 1;
405}
406
407#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
408
409/*
410 * Do priority-boost accounting for the start of a new grace period.
411 */
412static void rcu_preempt_boost_start_gp(void)
413{
414 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
415}
416
417#else /* #ifdef CONFIG_RCU_BOOST */
418
419/*
420 * If there is no RCU priority boosting, we don't initiate boosting,
421 * but we do indicate whether there are blocked readers blocking the
422 * current grace period.
423 */
424static int rcu_initiate_boost(void)
425{
426 return rcu_preempt_blocked_readers_cgp();
427}
428
429/*
430 * If there is no RCU priority boosting, nothing to do at grace-period start.
431 */
432static void rcu_preempt_boost_start_gp(void)
433{
434}
435
436#endif /* else #ifdef CONFIG_RCU_BOOST */
437
438/*
439 * Record a preemptible-RCU quiescent state for the specified CPU. Note
440 * that this just means that the task currently running on the CPU is
441 * in a quiescent state. There might be any number of tasks blocked
442 * while in an RCU read-side critical section.
443 *
444 * Unlike the other rcu_*_qs() functions, callers to this function
445 * must disable irqs in order to protect the assignment to
446 * ->rcu_read_unlock_special.
447 *
448 * Because this is a single-CPU implementation, the only way a grace
449 * period can end is if the CPU is in a quiescent state. The reason is
450 * that a blocked preemptible-RCU reader can exit its critical section
451 * only if the CPU is running it at the time. Therefore, when the
452 * last task blocking the current grace period exits its RCU read-side
453 * critical section, neither the CPU nor blocked tasks will be stopping
454 * the current grace period. (In contrast, SMP implementations
455 * might have CPUs running in RCU read-side critical sections that
456 * block later grace periods -- but this is not possible given only
457 * one CPU.)
458 */
459static void rcu_preempt_cpu_qs(void)
460{
461 /* Record both CPU and task as having responded to current GP. */
462 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
463 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
464
465 /* If there is no GP then there is nothing more to do. */
466 if (!rcu_preempt_gp_in_progress())
467 return;
468 /*
469 * Check up on boosting. If there are readers blocking the
470 * current grace period, leave.
471 */
472 if (rcu_initiate_boost())
473 return;
474
475 /* Advance callbacks. */
476 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
477 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
478 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
479
480 /* If there are no blocked readers, next GP is done instantly. */
481 if (!rcu_preempt_blocked_readers_any())
482 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
483
484 /* If there are done callbacks, cause them to be invoked. */
485 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
486 invoke_rcu_callbacks();
487}
488
489/*
490 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
491 */
492static void rcu_preempt_start_gp(void)
493{
494 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
495
496 /* Official start of GP. */
497 rcu_preempt_ctrlblk.gpnum++;
498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
500
501 /* Any blocked RCU readers block new GP. */
502 if (rcu_preempt_blocked_readers_any())
503 rcu_preempt_ctrlblk.gp_tasks =
504 rcu_preempt_ctrlblk.blkd_tasks.next;
505
506 /* Set up for RCU priority boosting. */
507 rcu_preempt_boost_start_gp();
508
509 /* If there is no running reader, CPU is done with GP. */
510 if (!rcu_preempt_running_reader())
511 rcu_preempt_cpu_qs();
512 }
513}
514
515/*
516 * We have entered the scheduler, and the current task might soon be
517 * context-switched away from. If this task is in an RCU read-side
518 * critical section, we will no longer be able to rely on the CPU to
519 * record that fact, so we enqueue the task on the blkd_tasks list.
520 * If the task started after the current grace period began, as recorded
521 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
522 * before the element referenced by ->gp_tasks (or at the tail if
523 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
524 * The task will dequeue itself when it exits the outermost enclosing
525 * RCU read-side critical section. Therefore, the current grace period
526 * cannot be permitted to complete until the ->gp_tasks pointer becomes
527 * NULL.
528 *
529 * Caller must disable preemption.
530 */
531void rcu_preempt_note_context_switch(void)
532{
533 struct task_struct *t = current;
534 unsigned long flags;
535
536 local_irq_save(flags); /* must exclude scheduler_tick(). */
537 if (rcu_preempt_running_reader() > 0 &&
538 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
539
540 /* Possibly blocking in an RCU read-side critical section. */
541 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
542
543 /*
544 * If this CPU has already checked in, then this task
545 * will hold up the next grace period rather than the
546 * current grace period. Queue the task accordingly.
547 * If the task is queued for the current grace period
548 * (i.e., this CPU has not yet passed through a quiescent
549 * state for the current grace period), then as long
550 * as that task remains queued, the current grace period
551 * cannot end.
552 */
553 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
554 if (rcu_cpu_blocking_cur_gp())
555 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
556 } else if (rcu_preempt_running_reader() < 0 &&
557 t->rcu_read_unlock_special) {
558 /*
559 * Complete exit from RCU read-side critical section on
560 * behalf of preempted instance of __rcu_read_unlock().
561 */
562 rcu_read_unlock_special(t);
563 }
564
565 /*
566 * Either we were not in an RCU read-side critical section to
567 * begin with, or we have now recorded that critical section
568 * globally. Either way, we can now note a quiescent state
569 * for this CPU. Again, if we were in an RCU read-side critical
570 * section, and if that critical section was blocking the current
571 * grace period, then the fact that the task has been enqueued
572 * means that current grace period continues to be blocked.
573 */
574 rcu_preempt_cpu_qs();
575 local_irq_restore(flags);
576}
577
578/*
579 * Handle special cases during rcu_read_unlock(), such as needing to
580 * notify RCU core processing or task having blocked during the RCU
581 * read-side critical section.
582 */
583void rcu_read_unlock_special(struct task_struct *t)
584{
585 int empty;
586 int empty_exp;
587 unsigned long flags;
588 struct list_head *np;
589#ifdef CONFIG_RCU_BOOST
590 struct rt_mutex *rbmp = NULL;
591#endif /* #ifdef CONFIG_RCU_BOOST */
592 int special;
593
594 /*
595 * NMI handlers cannot block and cannot safely manipulate state.
596 * They therefore cannot possibly be special, so just leave.
597 */
598 if (in_nmi())
599 return;
600
601 local_irq_save(flags);
602
603 /*
604 * If RCU core is waiting for this CPU to exit critical section,
605 * let it know that we have done so.
606 */
607 special = t->rcu_read_unlock_special;
608 if (special & RCU_READ_UNLOCK_NEED_QS)
609 rcu_preempt_cpu_qs();
610
611 /* Hardware IRQ handlers cannot block. */
612 if (in_irq() || in_serving_softirq()) {
613 local_irq_restore(flags);
614 return;
615 }
616
617 /* Clean up if blocked during RCU read-side critical section. */
618 if (special & RCU_READ_UNLOCK_BLOCKED) {
619 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
620
621 /*
622 * Remove this task from the ->blkd_tasks list and adjust
623 * any pointers that might have been referencing it.
624 */
625 empty = !rcu_preempt_blocked_readers_cgp();
626 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
627 np = rcu_next_node_entry(t);
628 list_del_init(&t->rcu_node_entry);
629 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
630 rcu_preempt_ctrlblk.gp_tasks = np;
631 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
632 rcu_preempt_ctrlblk.exp_tasks = np;
633#ifdef CONFIG_RCU_BOOST
634 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
635 rcu_preempt_ctrlblk.boost_tasks = np;
636#endif /* #ifdef CONFIG_RCU_BOOST */
637
638 /*
639 * If this was the last task on the current list, and if
640 * we aren't waiting on the CPU, report the quiescent state
641 * and start a new grace period if needed.
642 */
643 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
644 rcu_preempt_cpu_qs();
645 rcu_preempt_start_gp();
646 }
647
648 /*
649 * If this was the last task on the expedited lists,
650	 * then we need to wake up the waiting task.
651 */
652 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
653 rcu_report_exp_done();
654 }
655#ifdef CONFIG_RCU_BOOST
656 /* Unboost self if was boosted. */
657 if (t->rcu_boost_mutex != NULL) {
658 rbmp = t->rcu_boost_mutex;
659 t->rcu_boost_mutex = NULL;
660 rt_mutex_unlock(rbmp);
661 }
662#endif /* #ifdef CONFIG_RCU_BOOST */
663 local_irq_restore(flags);
664}
665
666/*
667 * Check for a quiescent state from the current CPU. When a task blocks,
668 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
669 * checked elsewhere. This is called from the scheduling-clock interrupt.
670 *
671 * Caller must disable hard irqs.
672 */
673static void rcu_preempt_check_callbacks(void)
674{
675 struct task_struct *t = current;
676
677 if (rcu_preempt_gp_in_progress() &&
678 (!rcu_preempt_running_reader() ||
679 !rcu_cpu_blocking_cur_gp()))
680 rcu_preempt_cpu_qs();
681 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
682 rcu_preempt_ctrlblk.rcb.donetail)
683 invoke_rcu_callbacks();
684 if (rcu_preempt_gp_in_progress() &&
685 rcu_cpu_blocking_cur_gp() &&
686 rcu_preempt_running_reader() > 0)
687 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
688}
689
690/*
691 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
692 * update, so this is invoked from rcu_process_callbacks() to
693 * handle that case. Of course, it is invoked for all flavors of
694 * RCU, but RCU callbacks can appear only on one of the lists, and
695 * neither ->nexttail nor ->donetail can possibly be NULL, so there
696 * is no need for an explicit check.
697 */
698static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
699{
700 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
701 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
702}
703
704/*
705 * Process callbacks for preemptible RCU.
706 */
707static void rcu_preempt_process_callbacks(void)
708{
709 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
710}
711
712/*
713 * Queue a preemptible-RCU callback for invocation after a grace period.
714 */
715void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
716{
717 unsigned long flags;
718
719 debug_rcu_head_queue(head);
720 head->func = func;
721 head->next = NULL;
722
723 local_irq_save(flags);
724 *rcu_preempt_ctrlblk.nexttail = head;
725 rcu_preempt_ctrlblk.nexttail = &head->next;
726 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
727 rcu_preempt_start_gp(); /* checks to see if GP needed. */
728 local_irq_restore(flags);
729}
730EXPORT_SYMBOL_GPL(call_rcu);
731
732/*
733 * synchronize_rcu - wait until a grace period has elapsed.
734 *
735 * Control will return to the caller some time after a full grace
736 * period has elapsed, in other words after all currently executing RCU
737 * read-side critical sections have completed. RCU read-side critical
738 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
739 * and may be nested.
740 */
741void synchronize_rcu(void)
742{
743 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
744 !lock_is_held(&rcu_lock_map) &&
745 !lock_is_held(&rcu_sched_lock_map),
746 "Illegal synchronize_rcu() in RCU read-side critical section");
747
748#ifdef CONFIG_DEBUG_LOCK_ALLOC
749 if (!rcu_scheduler_active)
750 return;
751#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
752
753 WARN_ON_ONCE(rcu_preempt_running_reader());
754 if (!rcu_preempt_blocked_readers_any())
755 return;
756
757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
762}
763EXPORT_SYMBOL_GPL(synchronize_rcu);
764
765static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
766static unsigned long sync_rcu_preempt_exp_count;
767static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
768
769/*
770 * Return non-zero if there are any tasks in RCU read-side critical
771 * sections blocking the current preemptible-RCU expedited grace period.
772 * If there is no preemptible-RCU expedited grace period currently in
773 * progress, returns zero unconditionally.
774 */
775static int rcu_preempted_readers_exp(void)
776{
777 return rcu_preempt_ctrlblk.exp_tasks != NULL;
778}
779
780/*
781 * Report the exit from RCU read-side critical section for the last task
782 * that queued itself during or before the current expedited preemptible-RCU
783 * grace period.
784 */
785static void rcu_report_exp_done(void)
786{
787 wake_up(&sync_rcu_preempt_exp_wq);
788}
789
790/*
791 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
792 * is to rely on the fact that there is but one CPU, and that it is
793 * illegal for a task to invoke synchronize_rcu_expedited() while in a
794 * preemptible-RCU read-side critical section. Therefore, any such
795 * critical sections must correspond to blocked tasks, which must therefore
796 * be on the ->blkd_tasks list. So just record the current head of the
797 * list in the ->exp_tasks pointer, and wait for all tasks including and
798 * after the task pointed to by ->exp_tasks to drain.
799 */
800void synchronize_rcu_expedited(void)
801{
802 unsigned long flags;
803 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
804 unsigned long snap;
805
806 barrier(); /* ensure prior action seen before grace period. */
807
808 WARN_ON_ONCE(rcu_preempt_running_reader());
809
810 /*
811 * Acquire lock so that there is only one preemptible RCU grace
812 * period in flight. Of course, if someone does the expedited
813 * grace period for us while we are acquiring the lock, just leave.
814 */
815 snap = sync_rcu_preempt_exp_count + 1;
816 mutex_lock(&sync_rcu_preempt_exp_mutex);
817 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
818 goto unlock_mb_ret; /* Others did our work for us. */
819
820 local_irq_save(flags);
821
822 /*
823 * All RCU readers have to already be on blkd_tasks because
824 * we cannot legally be executing in an RCU read-side critical
825 * section.
826 */
827
828 /* Snapshot current head of ->blkd_tasks list. */
829 rpcp->exp_tasks = rpcp->blkd_tasks.next;
830 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
831 rpcp->exp_tasks = NULL;
832
833 /* Wait for tail of ->blkd_tasks list to drain. */
834 if (!rcu_preempted_readers_exp()) {
835 local_irq_restore(flags);
836 } else {
837 rcu_initiate_boost();
838 local_irq_restore(flags);
839 wait_event(sync_rcu_preempt_exp_wq,
840 !rcu_preempted_readers_exp());
841 }
842
843 /* Clean up and exit. */
844 barrier(); /* ensure expedited GP seen before counter increment. */
845 sync_rcu_preempt_exp_count++;
846unlock_mb_ret:
847 mutex_unlock(&sync_rcu_preempt_exp_mutex);
848 barrier(); /* ensure subsequent action seen after grace period. */
849}
850EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
851
852/*
853 * Does preemptible RCU need the CPU to stay out of dynticks mode?
854 */
855int rcu_preempt_needs_cpu(void)
856{
857 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
858}
859
860#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
861
862#ifdef CONFIG_RCU_TRACE
863
864/*
865 * Because preemptible RCU does not exist, it is not necessary to
866 * dump out its statistics.
867 */
868static void show_tiny_preempt_stats(struct seq_file *m)
869{
870}
871
872#endif /* #ifdef CONFIG_RCU_TRACE */
873
874/*
875 * Because preemptible RCU does not exist, it never has any callbacks
876 * to check.
877 */
878static void rcu_preempt_check_callbacks(void)
879{
880}
881
882/*
883 * Because preemptible RCU does not exist, it never has any callbacks
884 * to remove.
885 */
886static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
887{
888}
889
890/*
891 * Because preemptible RCU does not exist, it never has any callbacks
892 * to process.
893 */
894static void rcu_preempt_process_callbacks(void)
895{
896}
897
898#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
899
900#ifdef CONFIG_RCU_BOOST
901
902/*
903 * Wake up rcu_kthread() to process callbacks now eligible for invocation
904 * or to boost readers.
905 */
906static void invoke_rcu_callbacks(void)
907{
908 have_rcu_kthread_work = 1;
909 if (rcu_kthread_task != NULL)
910 wake_up(&rcu_kthread_wq);
911}
912
913#ifdef CONFIG_RCU_TRACE
914
915/*
916 * Is the current CPU running the RCU-callbacks kthread?
917 * Caller must have preemption disabled.
918 */
919static bool rcu_is_callbacks_kthread(void)
920{
921 return rcu_kthread_task == current;
922}
923
924#endif /* #ifdef CONFIG_RCU_TRACE */
925
926/*
927 * This kthread invokes RCU callbacks whose grace periods have
928 * elapsed. It is awakened as needed, and takes the place of the
929 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
930 * This is a kthread, but it is never stopped, at least not until
931 * the system goes down.
932 */
933static int rcu_kthread(void *arg)
934{
935 unsigned long work;
936 unsigned long morework;
937 unsigned long flags;
938
939 for (;;) {
940 wait_event_interruptible(rcu_kthread_wq,
941 have_rcu_kthread_work != 0);
942 morework = rcu_boost();
943 local_irq_save(flags);
944 work = have_rcu_kthread_work;
945 have_rcu_kthread_work = morework;
946 local_irq_restore(flags);
947 if (work)
948 rcu_process_callbacks(NULL);
949 schedule_timeout_interruptible(1); /* Leave CPU for others. */
950 }
951
952 return 0; /* Not reached, but needed to shut gcc up. */
953}
954
955/*
956 * Spawn the kthread that invokes RCU callbacks.
957 */
958static int __init rcu_spawn_kthreads(void)
959{
960 struct sched_param sp;
961
962 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
963 sp.sched_priority = RCU_BOOST_PRIO;
964 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
965 return 0;
966}
967early_initcall(rcu_spawn_kthreads);
968
969#else /* #ifdef CONFIG_RCU_BOOST */
970
971/* Hold off callback invocation until early_initcall() time. */
972static int rcu_scheduler_fully_active __read_mostly;
973
974/*
975 * Start up softirq processing of callbacks.
976 */
977void invoke_rcu_callbacks(void)
978{
979 if (rcu_scheduler_fully_active)
980 raise_softirq(RCU_SOFTIRQ);
981}
982
983#ifdef CONFIG_RCU_TRACE
984
985/*
986 * There is no callback kthread, so this thread is never it.
987 */
988static bool rcu_is_callbacks_kthread(void)
989{
990 return false;
991}
992
993#endif /* #ifdef CONFIG_RCU_TRACE */
994
995static int __init rcu_scheduler_really_started(void)
996{
997 rcu_scheduler_fully_active = 1;
998 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
999 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1000 return 0;
1001}
1002early_initcall(rcu_scheduler_really_started);
1003
1004#endif /* #else #ifdef CONFIG_RCU_BOOST */
1005
1006#ifdef CONFIG_DEBUG_LOCK_ALLOC
1007#include <linux/kernel_stat.h>
1008 60
1009/* 61/*
1010 * During boot, we forgive RCU lockdep issues. After this function is 62 * During boot, we forgive RCU lockdep issues. After this function is
@@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void)
1020 72
1021#ifdef CONFIG_RCU_TRACE 73#ifdef CONFIG_RCU_TRACE
1022 74
1023#ifdef CONFIG_RCU_BOOST
1024
1025static void rcu_initiate_boost_trace(void)
1026{
1027 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
1028 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
1029 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
1030 rcu_preempt_ctrlblk.exp_tasks == NULL)
1031 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
1032 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
1033 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
1034 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
1035 rcu_preempt_ctrlblk.n_balk_notyet++;
1036 else
1037 rcu_preempt_ctrlblk.n_balk_nos++;
1038}
1039
1040#endif /* #ifdef CONFIG_RCU_BOOST */
1041
1042static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) 75static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1043{ 76{
1044 unsigned long flags; 77 unsigned long flags;
@@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1053 */ 86 */
1054static int show_tiny_stats(struct seq_file *m, void *unused) 87static int show_tiny_stats(struct seq_file *m, void *unused)
1055{ 88{
1056 show_tiny_preempt_stats(m);
1057 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); 89 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
1058 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); 90 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
1059 return 0; 91 return 0;
@@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney");
1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 135MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1104MODULE_LICENSE("GPL"); 136MODULE_LICENSE("GPL");
1105 137
1106static void check_cpu_stall_preempt(void) 138static void check_cpu_stall(struct rcu_ctrlblk *rcp)
1107{ 139{
1108#ifdef CONFIG_TINY_PREEMPT_RCU 140 unsigned long j;
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb); 141 unsigned long js;
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 142
143 if (rcu_cpu_stall_suppress)
144 return;
145 rcp->ticks_this_gp++;
146 j = jiffies;
147 js = rcp->jiffies_stall;
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
159}
160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{
163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
166}
167
168static void check_cpu_stalls(void)
169{
170 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
171 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
1111} 172}
1112 173
1113#endif /* #ifdef CONFIG_RCU_TRACE */ 174#endif /* #ifdef CONFIG_RCU_TRACE */
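The stall checks retained above compare jiffies values with ULONG_CMP_GE() rather than a plain >= so the test stays correct across counter wraparound. A standalone demonstration follows; the macro body mirrors what the kernel's definition is understood to be and should be treated as illustrative rather than a quote:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long stall_at = (unsigned long)-10;	/* just before wrap */
	unsigned long now = 5;				/* just after wrap */

	printf("plain >=     : %d\n", now >= stall_at);		/* 0: misses the stall */
	printf("ULONG_CMP_GE : %d\n", ULONG_CMP_GE(now, stall_at));	/* 1: correct */
	return 0;
}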
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e1f3a8c96724..b1fa5510388d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {
695 .name = "srcu_sync" 695 .name = "srcu_sync"
696}; 696};
697 697
698static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
699{
700 return srcu_read_lock_raw(&srcu_ctl);
701}
702
703static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
704{
705 srcu_read_unlock_raw(&srcu_ctl, idx);
706}
707
708static struct rcu_torture_ops srcu_raw_ops = {
709 .init = rcu_sync_torture_init,
710 .readlock = srcu_torture_read_lock_raw,
711 .read_delay = srcu_read_delay,
712 .readunlock = srcu_torture_read_unlock_raw,
713 .completed = srcu_torture_completed,
714 .deferred_free = srcu_torture_deferred_free,
715 .sync = srcu_torture_synchronize,
716 .call = NULL,
717 .cb_barrier = NULL,
718 .stats = srcu_torture_stats,
719 .name = "srcu_raw"
720};
721
722static struct rcu_torture_ops srcu_raw_sync_ops = {
723 .init = rcu_sync_torture_init,
724 .readlock = srcu_torture_read_lock_raw,
725 .read_delay = srcu_read_delay,
726 .readunlock = srcu_torture_read_unlock_raw,
727 .completed = srcu_torture_completed,
728 .deferred_free = rcu_sync_torture_deferred_free,
729 .sync = srcu_torture_synchronize,
730 .call = NULL,
731 .cb_barrier = NULL,
732 .stats = srcu_torture_stats,
733 .name = "srcu_raw_sync"
734};
735
736static void srcu_torture_synchronize_expedited(void) 698static void srcu_torture_synchronize_expedited(void)
737{ 699{
738 synchronize_srcu_expedited(&srcu_ctl); 700 synchronize_srcu_expedited(&srcu_ctl);
@@ -1983,7 +1945,6 @@ rcu_torture_init(void)
1983 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1984 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1985 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1986 &srcu_raw_ops, &srcu_raw_sync_ops,
1987 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1988 1949
1989 mutex_lock(&fullstop_mutex); 1950 mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea67925015..cf3adc6fe001 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -218,8 +218,8 @@ module_param(blimit, long, 0444);
218module_param(qhimark, long, 0444); 218module_param(qhimark, long, 0444);
219module_param(qlowmark, long, 0444); 219module_param(qlowmark, long, 0444);
220 220
221static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_first_fqs = ULONG_MAX;
222static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 222static ulong jiffies_till_next_fqs = ULONG_MAX;
223 223
224module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
225module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
@@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
866 * See Documentation/RCU/stallwarn.txt for info on how to debug 866 * See Documentation/RCU/stallwarn.txt for info on how to debug
867 * RCU CPU stall warnings. 867 * RCU CPU stall warnings.
868 */ 868 */
869 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 869 pr_err("INFO: %s detected stalls on CPUs/tasks:",
870 rsp->name); 870 rsp->name);
871 print_cpu_stall_info_begin(); 871 print_cpu_stall_info_begin();
872 rcu_for_each_leaf_node(rsp, rnp) { 872 rcu_for_each_leaf_node(rsp, rnp) {
@@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
899 smp_processor_id(), (long)(jiffies - rsp->gp_start), 899 smp_processor_id(), (long)(jiffies - rsp->gp_start),
900 rsp->gpnum, rsp->completed, totqlen); 900 rsp->gpnum, rsp->completed, totqlen);
901 if (ndetected == 0) 901 if (ndetected == 0)
902 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 902 pr_err("INFO: Stall ended before state dump start\n");
903 else if (!trigger_all_cpu_backtrace()) 903 else if (!trigger_all_cpu_backtrace())
904 rcu_dump_cpu_stacks(rsp); 904 rcu_dump_cpu_stacks(rsp);
905 905
@@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
922 * See Documentation/RCU/stallwarn.txt for info on how to debug 922 * See Documentation/RCU/stallwarn.txt for info on how to debug
923 * RCU CPU stall warnings. 923 * RCU CPU stall warnings.
924 */ 924 */
925 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 925 pr_err("INFO: %s self-detected stall on CPU", rsp->name);
926 print_cpu_stall_info_begin(); 926 print_cpu_stall_info_begin();
927 print_cpu_stall_info(rsp, smp_processor_id()); 927 print_cpu_stall_info(rsp, smp_processor_id());
928 print_cpu_stall_info_end(); 928 print_cpu_stall_info_end();
@@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void)
985} 985}
986 986
987/* 987/*
988 * Update CPU-local rcu_data state to record the newly noticed grace period.
989 * This is used both when we started the grace period and when we notice
990 * that someone else started the grace period. The caller must hold the
991 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
992 * and must have irqs disabled.
993 */
994static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
995{
996 if (rdp->gpnum != rnp->gpnum) {
997 /*
998 * If the current grace period is waiting for this CPU,
999 * set up to detect a quiescent state, otherwise don't
1000 * go looking for one.
1001 */
1002 rdp->gpnum = rnp->gpnum;
1003 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1004 rdp->passed_quiesce = 0;
1005 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1006 zero_cpu_stall_ticks(rdp);
1007 }
1008}
1009
1010static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
1011{
1012 unsigned long flags;
1013 struct rcu_node *rnp;
1014
1015 local_irq_save(flags);
1016 rnp = rdp->mynode;
1017 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
1018 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1019 local_irq_restore(flags);
1020 return;
1021 }
1022 __note_new_gpnum(rsp, rnp, rdp);
1023 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1024}
1025
1026/*
1027 * Did someone else start a new RCU grace period start since we last
1028 * checked? Update local state appropriately if so. Must be called
1029 * on the CPU corresponding to rdp.
1030 */
1031static int
1032check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1033{
1034 unsigned long flags;
1035 int ret = 0;
1036
1037 local_irq_save(flags);
1038 if (rdp->gpnum != rsp->gpnum) {
1039 note_new_gpnum(rsp, rdp);
1040 ret = 1;
1041 }
1042 local_irq_restore(flags);
1043 return ret;
1044}
1045
1046/*
1047 * Initialize the specified rcu_data structure's callback list to empty. 988 * Initialize the specified rcu_data structure's callback list to empty.
1048 */ 989 */
1049static void init_callback_list(struct rcu_data *rdp) 990static void init_callback_list(struct rcu_data *rdp)
@@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1313} 1254}
1314 1255
1315/* 1256/*
1316 * Advance this CPU's callbacks, but only if the current grace period 1257 * Update CPU-local rcu_data state to record the beginnings and ends of
1317 * has ended. This may be called only from the CPU to whom the rdp 1258 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1318 * belongs. In addition, the corresponding leaf rcu_node structure's 1259 * structure corresponding to the current CPU, and must have irqs disabled.
1319 * ->lock must be held by the caller, with irqs disabled.
1320 */ 1260 */
1321static void 1261static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1322__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1323{ 1262{
1324 /* Did another grace period end? */ 1263 /* Handle the ends of any preceding grace periods first. */
1325 if (rdp->completed == rnp->completed) { 1264 if (rdp->completed == rnp->completed) {
1326 1265
1327 /* No, so just accelerate recent callbacks. */ 1266 /* No grace period end, so just accelerate recent callbacks. */
1328 rcu_accelerate_cbs(rsp, rnp, rdp); 1267 rcu_accelerate_cbs(rsp, rnp, rdp);
1329 1268
1330 } else { 1269 } else {
@@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1335 /* Remember that we saw this grace-period completion. */ 1274 /* Remember that we saw this grace-period completion. */
1336 rdp->completed = rnp->completed; 1275 rdp->completed = rnp->completed;
1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1277 }
1338 1278
1279 if (rdp->gpnum != rnp->gpnum) {
1339 /* 1280 /*
1340 * If we were in an extended quiescent state, we may have 1281 * If the current grace period is waiting for this CPU,
1341 * missed some grace periods that others CPUs handled on 1282 * set up to detect a quiescent state, otherwise don't
1342 * our behalf. Catch up with this state to avoid noting 1283 * go looking for one.
1343 * spurious new grace periods. If another grace period
1344 * has started, then rnp->gpnum will have advanced, so
1345 * we will detect this later on. Of course, any quiescent
1346 * states we found for the old GP are now invalid.
1347 */
1348 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
1349 rdp->gpnum = rdp->completed;
1350 rdp->passed_quiesce = 0;
1351 }
1352
1353 /*
1354 * If RCU does not need a quiescent state from this CPU,
1355 * then make sure that this CPU doesn't go looking for one.
1356 */ 1284 */
1357 if ((rnp->qsmask & rdp->grpmask) == 0) 1285 rdp->gpnum = rnp->gpnum;
1358 rdp->qs_pending = 0; 1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1287 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp);
1359 } 1290 }
1360} 1291}
1361 1292
1362/* 1293static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1363 * Advance this CPU's callbacks, but only if the current grace period
1364 * has ended. This may be called only from the CPU to whom the rdp
1365 * belongs.
1366 */
1367static void
1368rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
1369{ 1294{
1370 unsigned long flags; 1295 unsigned long flags;
1371 struct rcu_node *rnp; 1296 struct rcu_node *rnp;
1372 1297
1373 local_irq_save(flags); 1298 local_irq_save(flags);
1374 rnp = rdp->mynode; 1299 rnp = rdp->mynode;
1375 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 1300 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1301 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
1376 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1302 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1377 local_irq_restore(flags); 1303 local_irq_restore(flags);
1378 return; 1304 return;
1379 } 1305 }
1380 __rcu_process_gp_end(rsp, rnp, rdp); 1306 __note_gp_changes(rsp, rnp, rdp);
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1307 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382} 1308}
1383 1309
1384/* 1310/*
1385 * Do per-CPU grace-period initialization for running CPU. The caller
1386 * must hold the lock of the leaf rcu_node structure corresponding to
1387 * this CPU.
1388 */
1389static void
1390rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1391{
1392 /* Prior grace period ended, so advance callbacks for current CPU. */
1393 __rcu_process_gp_end(rsp, rnp, rdp);
1394
1395 /* Set state so that this CPU will detect the next quiescent state. */
1396 __note_new_gpnum(rsp, rnp, rdp);
1397}
1398
1399/*
1400 * Initialize a new grace period. 1311 * Initialize a new grace period.
1401 */ 1312 */
1402static int rcu_gp_init(struct rcu_state *rsp) 1313static int rcu_gp_init(struct rcu_state *rsp)
@@ -1444,16 +1355,16 @@ static int rcu_gp_init(struct rcu_state *rsp)
1444 WARN_ON_ONCE(rnp->completed != rsp->completed); 1355 WARN_ON_ONCE(rnp->completed != rsp->completed);
1445 ACCESS_ONCE(rnp->completed) = rsp->completed; 1356 ACCESS_ONCE(rnp->completed) = rsp->completed;
1446 if (rnp == rdp->mynode) 1357 if (rnp == rdp->mynode)
1447 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1358 __note_gp_changes(rsp, rnp, rdp);
1448 rcu_preempt_boost_start_gp(rnp); 1359 rcu_preempt_boost_start_gp(rnp);
1449 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1360 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1450 rnp->level, rnp->grplo, 1361 rnp->level, rnp->grplo,
1451 rnp->grphi, rnp->qsmask); 1362 rnp->grphi, rnp->qsmask);
1452 raw_spin_unlock_irq(&rnp->lock); 1363 raw_spin_unlock_irq(&rnp->lock);
1453#ifdef CONFIG_PROVE_RCU_DELAY 1364#ifdef CONFIG_PROVE_RCU_DELAY
1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && 1365 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1455 system_state == SYSTEM_RUNNING) 1366 system_state == SYSTEM_RUNNING)
1456 schedule_timeout_uninterruptible(2); 1367 udelay(200);
1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1368#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1458 cond_resched(); 1369 cond_resched();
1459 } 1370 }
@@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1438 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda); 1439 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode) 1440 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp); 1441 __note_gp_changes(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp); 1442 nocb += rcu_future_gp_cleanup(rsp, rnp);
1532 raw_spin_unlock_irq(&rnp->lock); 1443 raw_spin_unlock_irq(&rnp->lock);
1533 cond_resched(); 1444 cond_resched();
@@ -1613,6 +1524,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
1613 } 1524 }
1614} 1525}
1615 1526
1527static void rsp_wakeup(struct irq_work *work)
1528{
1529 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1530
1531 /* Wake up rcu_gp_kthread() to start the grace period. */
1532 wake_up(&rsp->gp_wq);
1533}
1534
1616/* 1535/*
1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1536 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1618 * in preparation for detecting the next grace period. The caller must hold 1537 * in preparation for detecting the next grace period. The caller must hold
@@ -1637,8 +1556,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1637 } 1556 }
1638 rsp->gp_flags = RCU_GP_FLAG_INIT; 1557 rsp->gp_flags = RCU_GP_FLAG_INIT;
1639 1558
1640 /* Wake up rcu_gp_kthread() to start the grace period. */ 1559 /*
1641 wake_up(&rsp->gp_wq); 1560 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context.
1563 */
1564 irq_work_queue(&rsp->wakeup_work);
1642} 1565}
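
The hunk above swaps a direct wake_up() made while rnp->lock is held for a wakeup deferred through irq_work, so the actual wait-queue wakeup runs later from hard-irq context with no rcu_node lock held. A minimal stand-alone sketch of that deferral pattern, not taken from this patch and using hypothetical names (my_state, my_wakeup, my_kick), could look like:

#include <linux/irq_work.h>
#include <linux/kernel.h>
#include <linux/wait.h>

struct my_state {
        wait_queue_head_t gp_wq;
        struct irq_work wakeup_work;
};

/* Runs from hard-irq context, outside any lock the requester held. */
static void my_wakeup(struct irq_work *work)
{
        struct my_state *s = container_of(work, struct my_state, wakeup_work);

        wake_up(&s->gp_wq);
}

static void my_init(struct my_state *s)
{
        init_waitqueue_head(&s->gp_wq);
        init_irq_work(&s->wakeup_work, my_wakeup);
}

/* Safe to call with a raw spinlock held: only queues the irq_work. */
static void my_kick(struct my_state *s)
{
        irq_work_queue(&s->wakeup_work);
}

The same three pieces (init_irq_work(), irq_work_queue(), and a callback that does the wake_up()) are what rsp_wakeup() and the rcu_init_one() change wire up in the hunks below.
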
1643 1566
1644/* 1567/*
@@ -1793,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1793static void 1716static void
1794rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 1717rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1795{ 1718{
1796 /* If there is now a new grace period, record and return. */ 1719 /* Check for grace-period ends and beginnings. */
1797 if (check_for_new_grace_period(rsp, rdp)) 1720 note_gp_changes(rsp, rdp);
1798 return;
1799 1721
1800 /* 1722 /*
1801 * Does this CPU still need to do its part for current grace period? 1723 * Does this CPU still need to do its part for current grace period?
@@ -2259,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2259 2181
2260 WARN_ON_ONCE(rdp->beenonline == 0); 2182 WARN_ON_ONCE(rdp->beenonline == 0);
2261 2183
2262 /* Handle the end of a grace period that some other CPU ended. */
2263 rcu_process_gp_end(rsp, rdp);
2264
2265 /* Update RCU state based on any recent quiescent states. */ 2184 /* Update RCU state based on any recent quiescent states. */
2266 rcu_check_quiescent_state(rsp, rdp); 2185 rcu_check_quiescent_state(rsp, rdp);
2267 2186
@@ -2346,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2346 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 2265 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
2347 2266
2348 /* Are we ignoring a completed grace period? */ 2267 /* Are we ignoring a completed grace period? */
2349 rcu_process_gp_end(rsp, rdp); 2268 note_gp_changes(rsp, rdp);
2350 check_for_new_grace_period(rsp, rdp);
2351 2269
2352 /* Start a new grace period if one not already started. */ 2270 /* Start a new grace period if one not already started. */
2353 if (!rcu_gp_in_progress(rsp)) { 2271 if (!rcu_gp_in_progress(rsp)) {
@@ -3235,6 +3153,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3235 3153
3236 rsp->rda = rda; 3154 rsp->rda = rda;
3237 init_waitqueue_head(&rsp->gp_wq); 3155 init_waitqueue_head(&rsp->gp_wq);
3156 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3238 rnp = rsp->level[rcu_num_lvls - 1]; 3157 rnp = rsp->level[rcu_num_lvls - 1];
3239 for_each_possible_cpu(i) { 3158 for_each_possible_cpu(i) {
3240 while (i > rnp->grphi) 3159 while (i > rnp->grphi)
@@ -3252,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3252 */ 3171 */
3253static void __init rcu_init_geometry(void) 3172static void __init rcu_init_geometry(void)
3254{ 3173{
3174 ulong d;
3255 int i; 3175 int i;
3256 int j; 3176 int j;
3257 int n = nr_cpu_ids; 3177 int n = nr_cpu_ids;
3258 int rcu_capacity[MAX_RCU_LVLS + 1]; 3178 int rcu_capacity[MAX_RCU_LVLS + 1];
3259 3179
3180 /*
3181 * Initialize any unspecified boot parameters.
3182 * The default values of jiffies_till_first_fqs and
3183 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3184 * value, which is a function of HZ, plus one for each
3185 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3186 */
3187 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3188 if (jiffies_till_first_fqs == ULONG_MAX)
3189 jiffies_till_first_fqs = d;
3190 if (jiffies_till_next_fqs == ULONG_MAX)
3191 jiffies_till_next_fqs = d;
3192
3260 /* If the compile-time values are accurate, just leave. */ 3193 /* If the compile-time values are accurate, just leave. */
3261 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3194 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3262 nr_cpu_ids == NR_CPUS) 3195 nr_cpu_ids == NR_CPUS)
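
For a sense of scale on the jiffies_till_first_fqs / jiffies_till_next_fqs defaults computed in the rcu_init_geometry() hunk above: assuming HZ=1000 and nr_cpu_ids=4096 (numbers picked only for illustration), RCU_JIFFIES_TILL_FORCE_QS works out to 3 and the per-CPU term adds 4096/256 = 16, for a default of 19 jiffies between forced-quiescent-state attempts. A throwaway user-space check of that arithmetic:

#include <stdio.h>

int main(void)
{
        int hz = 1000;                  /* illustrative HZ */
        int nr_cpu_ids = 4096;          /* illustrative CPU count */
        int till_force_qs = 1 + (hz > 250) + (hz > 500);        /* 3 */
        int fqs_div = 256;              /* RCU_JIFFIES_FQS_DIV */
        unsigned long d = till_force_qs + nr_cpu_ids / fqs_div;

        printf("default fqs interval: %lu jiffies\n", d);       /* 19 */
        return 0;
}
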
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff9..4a39d364493c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
30 31
31/* 32/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -342,12 +343,17 @@ struct rcu_data {
342#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
343#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
344 345
345#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 346#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
347 /* For jiffies_till_first_fqs and */
348 /* jiffies_till_next_fqs. */
346 349
347#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 350#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
348 /* to take at least one */ 351 /* delay between bouts of */
349 /* scheduling clock irq */ 352 /* quiescent-state forcing. */
350 /* before ratting on them. */ 353
354#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
355 /* at least one scheduling clock */
356 /* irq before ratting on them. */
351 357
352#define rcu_wait(cond) \ 358#define rcu_wait(cond) \
353do { \ 359do { \
@@ -442,6 +448,7 @@ struct rcu_state {
442 char *name; /* Name of structure. */ 448 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */ 449 char abbr; /* Abbreviated name. */
444 struct list_head flavors; /* List of RCU flavors. */ 450 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */
445}; 452};
446 453
447/* Values for rcu_state structure's gp_flags field. */ 454/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3db5a375d8dd..63098a59216e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5];
53static void __init rcu_bootup_announce_oddness(void) 53static void __init rcu_bootup_announce_oddness(void)
54{ 54{
55#ifdef CONFIG_RCU_TRACE 55#ifdef CONFIG_RCU_TRACE
56 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); 56 pr_info("\tRCU debugfs-based tracing is enabled.\n");
57#endif 57#endif
58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 59 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60 CONFIG_RCU_FANOUT); 60 CONFIG_RCU_FANOUT);
61#endif 61#endif
62#ifdef CONFIG_RCU_FANOUT_EXACT 62#ifdef CONFIG_RCU_FANOUT_EXACT
63 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); 63 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64#endif 64#endif
65#ifdef CONFIG_RCU_FAST_NO_HZ 65#ifdef CONFIG_RCU_FAST_NO_HZ
66 printk(KERN_INFO 66 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
68#endif 67#endif
69#ifdef CONFIG_PROVE_RCU 68#ifdef CONFIG_PROVE_RCU
70 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); 69 pr_info("\tRCU lockdep checking is enabled.\n");
71#endif 70#endif
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 71#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 72 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 73#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 74#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 75 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif 76#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 77#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 78 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 79#endif
81#if NUM_RCU_LVL_4 != 0 80#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 81 pr_info("\tFour-level hierarchy is enabled.\n");
83#endif 82#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 83 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 88#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) { 89 if (!have_rcu_nocb_mask) {
@@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void)
92 have_rcu_nocb_mask = true; 91 have_rcu_nocb_mask = true;
93 } 92 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO 93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n"); 94 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask); 95 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_setall(rcu_nocb_mask); 99 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
106 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
107 pr_info("\tExperimental polled no-CBs CPUs.\n"); 106 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
108 } 107 }
109#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 108#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
110} 109}
@@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
123 */ 122 */
124static void __init rcu_bootup_announce(void) 123static void __init rcu_bootup_announce(void)
125{ 124{
126 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); 125 pr_info("Preemptible hierarchical RCU implementation.\n");
127 rcu_bootup_announce_oddness(); 126 rcu_bootup_announce_oddness();
128} 127}
129 128
@@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 489
491static void rcu_print_task_stall_begin(struct rcu_node *rnp) 490static void rcu_print_task_stall_begin(struct rcu_node *rnp)
492{ 491{
493 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 492 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
494 rnp->level, rnp->grplo, rnp->grphi); 493 rnp->level, rnp->grplo, rnp->grphi);
495} 494}
496 495
497static void rcu_print_task_stall_end(void) 496static void rcu_print_task_stall_end(void)
498{ 497{
499 printk(KERN_CONT "\n"); 498 pr_cont("\n");
500} 499}
501 500
502#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 501#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
526 t = list_entry(rnp->gp_tasks, 525 t = list_entry(rnp->gp_tasks,
527 struct task_struct, rcu_node_entry); 526 struct task_struct, rcu_node_entry);
528 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 527 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
529 printk(KERN_CONT " P%d", t->pid); 528 pr_cont(" P%d", t->pid);
530 ndetected++; 529 ndetected++;
531 } 530 }
532 rcu_print_task_stall_end(); 531 rcu_print_task_stall_end();
@@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void)
933 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 932 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
934} 933}
935 934
935/*
936 * Check for a task exiting while in a preemptible-RCU read-side
937 * critical section; clean up if so. No need to issue warnings,
938 * as debug_check_no_locks_held() already does this if lockdep
939 * is enabled.
940 */
941void exit_rcu(void)
942{
943 struct task_struct *t = current;
944
945 if (likely(list_empty(&current->rcu_node_entry)))
946 return;
947 t->rcu_read_lock_nesting = 1;
948 barrier();
949 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
950 __rcu_read_unlock();
951}
952
936#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 953#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
937 954
938static struct rcu_state *rcu_state = &rcu_sched_state; 955static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state;
942 */ 959 */
943static void __init rcu_bootup_announce(void) 960static void __init rcu_bootup_announce(void)
944{ 961{
945 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 962 pr_info("Hierarchical RCU implementation.\n");
946 rcu_bootup_announce_oddness(); 963 rcu_bootup_announce_oddness();
947} 964}
948 965
@@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void)
1101{ 1118{
1102} 1119}
1103 1120
1121/*
1122 * Because preemptible RCU does not exist, tasks cannot possibly exit
1123 * while in preemptible RCU read-side critical sections.
1124 */
1125void exit_rcu(void)
1126{
1127}
1128
1104#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1129#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1105 1130
1106#ifdef CONFIG_RCU_BOOST 1131#ifdef CONFIG_RCU_BOOST
@@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void)
1629 */ 1654 */
1630 if (rdp->completed != rnp->completed && 1655 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1656 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp); 1657 note_gp_changes(rsp, rdp);
1633 1658
1634 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1659 if (cpu_has_callbacks_ready_to_invoke(rdp))
1635 cbs_ready = true; 1660 cbs_ready = true;
@@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1883/* Initiate the stall-info list. */ 1908/* Initiate the stall-info list. */
1884static void print_cpu_stall_info_begin(void) 1909static void print_cpu_stall_info_begin(void)
1885{ 1910{
1886 printk(KERN_CONT "\n"); 1911 pr_cont("\n");
1887} 1912}
1888 1913
1889/* 1914/*
@@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1914 ticks_value = rsp->gpnum - rdp->gpnum; 1939 ticks_value = rsp->gpnum - rdp->gpnum;
1915 } 1940 }
1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1941 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1942 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1918 cpu, ticks_value, ticks_title, 1943 cpu, ticks_value, ticks_title,
1919 atomic_read(&rdtp->dynticks) & 0xfff, 1944 atomic_read(&rdtp->dynticks) & 0xfff,
1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1945 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1925/* Terminate the stall-info list. */ 1950/* Terminate the stall-info list. */
1926static void print_cpu_stall_info_end(void) 1951static void print_cpu_stall_info_end(void)
1927{ 1952{
1928 printk(KERN_ERR "\t"); 1953 pr_err("\t");
1929} 1954}
1930 1955
1931/* Zero ->ticks_this_gp for all flavors of RCU. */ 1956/* Zero ->ticks_this_gp for all flavors of RCU. */
@@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void)
1948 1973
1949static void print_cpu_stall_info_begin(void) 1974static void print_cpu_stall_info_begin(void)
1950{ 1975{
1951 printk(KERN_CONT " {"); 1976 pr_cont(" {");
1952} 1977}
1953 1978
1954static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1979static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1955{ 1980{
1956 printk(KERN_CONT " %d", cpu); 1981 pr_cont(" %d", cpu);
1957} 1982}
1958 1983
1959static void print_cpu_stall_info_end(void) 1984static void print_cpu_stall_info_end(void)
1960{ 1985{
1961 printk(KERN_CONT "} "); 1986 pr_cont("} ");
1962} 1987}
1963 1988
1964static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1989static void zero_cpu_stall_ticks(struct rcu_data *rdp)
diff --git a/kernel/resource.c b/kernel/resource.c
index d7386986e10e..77bf11a86c7d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
409{ 409{
410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
411} 411}
412EXPORT_SYMBOL_GPL(page_is_ram);
412 413
413void __weak arch_remove_reservations(struct resource *avail) 414void __weak arch_remove_reservations(struct resource *avail)
414{ 415{
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 1e09308bf2a1..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -145,6 +145,19 @@ int max_lock_depth = 1024;
145/* 145/*
146 * Adjust the priority chain. Also used for deadlock detection. 146 * Adjust the priority chain. Also used for deadlock detection.
147 * Decreases task's usage by one - may thus free the task. 147 * Decreases task's usage by one - may thus free the task.
148 *
149 * @task: the task owning the mutex (owner) for which a chain walk is probably
150 * needed
151 * @deadlock_detect: do we have to carry out deadlock detection?
152 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
153 * things for a task that has just got its priority adjusted, and
154 * is waiting on a mutex)
155 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
156 * its priority to the mutex owner (can be NULL in the case
157 * depicted above or if the top waiter has gone away and we are
158 * actually deboosting the owner)
159 * @top_task: the current top waiter
160 *
148 * Returns 0 or -EDEADLK. 161 * Returns 0 or -EDEADLK.
149 */ 162 */
150static int rt_mutex_adjust_prio_chain(struct task_struct *task, 163static int rt_mutex_adjust_prio_chain(struct task_struct *task,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
77 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
78 goto out_free; 78 goto out_free;
79 79
80 sched_online_group(tg, &root_task_group);
81
82 kref_init(&ag->kref); 80 kref_init(&ag->kref);
83 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
84 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
98#endif 96#endif
99 tg->autogroup = ag; 97 tg->autogroup = ag;
100 98
99 sched_online_group(tg, &root_task_group);
101 return ag; 100 return ag;
102 101
103out_free: 102out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..9b1f2e533b95 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
633static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
634{ 634{
635 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636
637 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
638 return false;
639
640 if (idle_cpu(cpu) && !need_resched())
641 return true;
642
643 /*
644 * We can't run the Idle Load Balance on this CPU at this time, so we
645 * cancel it and clear NOHZ_BALANCE_KICK.
646 */
647 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
648 return false;
637} 649}
638 650
639#else /* CONFIG_NO_HZ_COMMON */ 651#else /* CONFIG_NO_HZ_COMMON */
@@ -667,7 +679,7 @@ void sched_avg_update(struct rq *rq)
667{ 679{
668 s64 period = sched_avg_period(); 680 s64 period = sched_avg_period();
669 681
670 while ((s64)(rq->clock - rq->age_stamp) > period) { 682 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
671 /* 683 /*
672 * Inline assembly required to prevent the compiler 684 * Inline assembly required to prevent the compiler
673 * optimising this loop into a divmod call. 685 * optimising this loop into a divmod call.
@@ -1328,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1328 p->sched_class->task_woken(rq, p); 1340 p->sched_class->task_woken(rq, p);
1329 1341
1330 if (rq->idle_stamp) { 1342 if (rq->idle_stamp) {
1331 u64 delta = rq->clock - rq->idle_stamp; 1343 u64 delta = rq_clock(rq) - rq->idle_stamp;
1332 u64 max = 2*sysctl_sched_migration_cost; 1344 u64 max = 2*sysctl_sched_migration_cost;
1333 1345
1334 if (delta > max) 1346 if (delta > max)
@@ -1365,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1365 1377
1366 rq = __task_rq_lock(p); 1378 rq = __task_rq_lock(p);
1367 if (p->on_rq) { 1379 if (p->on_rq) {
1380 /* check_preempt_curr() may use rq clock */
1381 update_rq_clock(rq);
1368 ttwu_do_wakeup(rq, p, wake_flags); 1382 ttwu_do_wakeup(rq, p, wake_flags);
1369 ret = 1; 1383 ret = 1;
1370 } 1384 }
@@ -1393,8 +1407,9 @@ static void sched_ttwu_pending(void)
1393 1407
1394void scheduler_ipi(void) 1408void scheduler_ipi(void)
1395{ 1409{
1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() 1410 if (llist_empty(&this_rq()->wake_list)
1397 && !tick_nohz_full_cpu(smp_processor_id())) 1411 && !tick_nohz_full_cpu(smp_processor_id())
1412 && !got_nohz_idle_kick())
1398 return; 1413 return;
1399 1414
1400 /* 1415 /*
@@ -1417,7 +1432,7 @@ void scheduler_ipi(void)
1417 /* 1432 /*
1418 * Check if someone kicked us for doing the nohz idle load balance. 1433 * Check if someone kicked us for doing the nohz idle load balance.
1419 */ 1434 */
1420 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1435 if (unlikely(got_nohz_idle_kick())) {
1421 this_rq()->idle_balance = 1; 1436 this_rq()->idle_balance = 1;
1422 raise_softirq_irqoff(SCHED_SOFTIRQ); 1437 raise_softirq_irqoff(SCHED_SOFTIRQ);
1423 } 1438 }
@@ -1596,15 +1611,6 @@ static void __sched_fork(struct task_struct *p)
1596 p->se.vruntime = 0; 1611 p->se.vruntime = 0;
1597 INIT_LIST_HEAD(&p->se.group_node); 1612 INIT_LIST_HEAD(&p->se.group_node);
1598 1613
1599/*
1600 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1601 * removed when useful for applications beyond shares distribution (e.g.
1602 * load-balance).
1603 */
1604#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1605 p->se.avg.runnable_avg_period = 0;
1606 p->se.avg.runnable_avg_sum = 0;
1607#endif
1608#ifdef CONFIG_SCHEDSTATS 1614#ifdef CONFIG_SCHEDSTATS
1609 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1615 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1610#endif 1616#endif
@@ -1748,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p)
1748 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1754 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1749#endif 1755#endif
1750 1756
1757 /* Initialize new task's runnable average */
1758 init_task_runnable_average(p);
1751 rq = __task_rq_lock(p); 1759 rq = __task_rq_lock(p);
1752 activate_task(rq, p, 0); 1760 activate_task(rq, p, 0);
1753 p->on_rq = 1; 1761 p->on_rq = 1;
@@ -2056,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu)
2056 return atomic_read(&this->nr_iowait); 2064 return atomic_read(&this->nr_iowait);
2057} 2065}
2058 2066
2059unsigned long this_cpu_load(void)
2060{
2061 struct rq *this = this_rq();
2062 return this->cpu_load[0];
2063}
2064
2065
2066/*
2067 * Global load-average calculations
2068 *
2069 * We take a distributed and async approach to calculating the global load-avg
2070 * in order to minimize overhead.
2071 *
2072 * The global load average is an exponentially decaying average of nr_running +
2073 * nr_uninterruptible.
2074 *
2075 * Once every LOAD_FREQ:
2076 *
2077 * nr_active = 0;
2078 * for_each_possible_cpu(cpu)
2079 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2080 *
2081 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2082 *
2083 * Due to a number of reasons the above turns in the mess below:
2084 *
2085 * - for_each_possible_cpu() is prohibitively expensive on machines with
2086 * serious number of cpus, therefore we need to take a distributed approach
2087 * to calculating nr_active.
2088 *
2089 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2090 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2091 *
2092 * So assuming nr_active := 0 when we start out -- true per definition, we
2093 * can simply take per-cpu deltas and fold those into a global accumulate
2094 * to obtain the same result. See calc_load_fold_active().
2095 *
2096 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2097 * across the machine, we assume 10 ticks is sufficient time for every
2098 * cpu to have completed this task.
2099 *
2100 * This places an upper-bound on the IRQ-off latency of the machine. Then
2101 * again, being late doesn't lose the delta, just wrecks the sample.
2102 *
2103 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2104 * this would add another cross-cpu cacheline miss and atomic operation
2105 * to the wakeup path. Instead we increment on whatever cpu the task ran
2106 * when it went into uninterruptible state and decrement on whatever cpu
2107 * did the wakeup. This means that only the sum of nr_uninterruptible over
2108 * all cpus yields the correct result.
2109 *
2110 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2111 */
2112
2113/* Variables and functions for calc_load */
2114static atomic_long_t calc_load_tasks;
2115static unsigned long calc_load_update;
2116unsigned long avenrun[3];
2117EXPORT_SYMBOL(avenrun); /* should be removed */
2118
2119/**
2120 * get_avenrun - get the load average array
2121 * @loads: pointer to dest load array
2122 * @offset: offset to add
2123 * @shift: shift count to shift the result left
2124 *
2125 * These values are estimates at best, so no need for locking.
2126 */
2127void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2128{
2129 loads[0] = (avenrun[0] + offset) << shift;
2130 loads[1] = (avenrun[1] + offset) << shift;
2131 loads[2] = (avenrun[2] + offset) << shift;
2132}
2133
2134static long calc_load_fold_active(struct rq *this_rq)
2135{
2136 long nr_active, delta = 0;
2137
2138 nr_active = this_rq->nr_running;
2139 nr_active += (long) this_rq->nr_uninterruptible;
2140
2141 if (nr_active != this_rq->calc_load_active) {
2142 delta = nr_active - this_rq->calc_load_active;
2143 this_rq->calc_load_active = nr_active;
2144 }
2145
2146 return delta;
2147}
2148
2149/*
2150 * a1 = a0 * e + a * (1 - e)
2151 */
2152static unsigned long
2153calc_load(unsigned long load, unsigned long exp, unsigned long active)
2154{
2155 load *= exp;
2156 load += active * (FIXED_1 - exp);
2157 load += 1UL << (FSHIFT - 1);
2158 return load >> FSHIFT;
2159}
2160
2161#ifdef CONFIG_NO_HZ_COMMON
2162/*
2163 * Handle NO_HZ for the global load-average.
2164 *
2165 * Since the above described distributed algorithm to compute the global
2166 * load-average relies on per-cpu sampling from the tick, it is affected by
2167 * NO_HZ.
2168 *
2169 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2170 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2171 * when we read the global state.
2172 *
2173 * Obviously reality has to ruin such a delightfully simple scheme:
2174 *
2175 * - When we go NO_HZ idle during the window, we can negate our sample
2176 * contribution, causing under-accounting.
2177 *
2178 * We avoid this by keeping two idle-delta counters and flipping them
2179 * when the window starts, thus separating old and new NO_HZ load.
2180 *
2181 * The only trick is the slight shift in index flip for read vs write.
2182 *
2183 * 0s 5s 10s 15s
2184 * +10 +10 +10 +10
2185 * |-|-----------|-|-----------|-|-----------|-|
2186 * r:0 0 1 1 0 0 1 1 0
2187 * w:0 1 1 0 0 1 1 0 0
2188 *
2189 * This ensures we'll fold the old idle contribution in this window while
2190 * accumulating the new one.
2191 *
2192 * - When we wake up from NO_HZ idle during the window, we push up our
2193 * contribution, since we effectively move our sample point to a known
2194 * busy state.
2195 *
2196 * This is solved by pushing the window forward, and thus skipping the
2197 * sample, for this cpu (effectively using the idle-delta for this cpu which
2198 * was in effect at the time the window opened). This also solves the issue
2199 * of having to deal with a cpu having been in NOHZ idle for multiple
2200 * LOAD_FREQ intervals.
2201 *
2202 * When making the ILB scale, we should try to pull this in as well.
2203 */
2204static atomic_long_t calc_load_idle[2];
2205static int calc_load_idx;
2206
2207static inline int calc_load_write_idx(void)
2208{
2209 int idx = calc_load_idx;
2210
2211 /*
2212 * See calc_global_nohz(), if we observe the new index, we also
2213 * need to observe the new update time.
2214 */
2215 smp_rmb();
2216
2217 /*
2218 * If the folding window started, make sure we start writing in the
2219 * next idle-delta.
2220 */
2221 if (!time_before(jiffies, calc_load_update))
2222 idx++;
2223
2224 return idx & 1;
2225}
2226
2227static inline int calc_load_read_idx(void)
2228{
2229 return calc_load_idx & 1;
2230}
2231
2232void calc_load_enter_idle(void)
2233{
2234 struct rq *this_rq = this_rq();
2235 long delta;
2236
2237 /*
2238 * We're going into NOHZ mode, if there's any pending delta, fold it
2239 * into the pending idle delta.
2240 */
2241 delta = calc_load_fold_active(this_rq);
2242 if (delta) {
2243 int idx = calc_load_write_idx();
2244 atomic_long_add(delta, &calc_load_idle[idx]);
2245 }
2246}
2247
2248void calc_load_exit_idle(void)
2249{
2250 struct rq *this_rq = this_rq();
2251
2252 /*
2253 * If we're still before the sample window, we're done.
2254 */
2255 if (time_before(jiffies, this_rq->calc_load_update))
2256 return;
2257
2258 /*
2259 * We woke inside or after the sample window, this means we're already
2260 * accounted through the nohz accounting, so skip the entire deal and
2261 * sync up for the next window.
2262 */
2263 this_rq->calc_load_update = calc_load_update;
2264 if (time_before(jiffies, this_rq->calc_load_update + 10))
2265 this_rq->calc_load_update += LOAD_FREQ;
2266}
2267
2268static long calc_load_fold_idle(void)
2269{
2270 int idx = calc_load_read_idx();
2271 long delta = 0;
2272
2273 if (atomic_long_read(&calc_load_idle[idx]))
2274 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2275
2276 return delta;
2277}
2278
2279/**
2280 * fixed_power_int - compute: x^n, in O(log n) time
2281 *
2282 * @x: base of the power
2283 * @frac_bits: fractional bits of @x
2284 * @n: power to raise @x to.
2285 *
2286 * By exploiting the relation between the definition of the natural power
2287 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2288 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2289 * (where: n_i \elem {0, 1}, the binary vector representing n),
2290 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2291 * of course trivially computable in O(log_2 n), the length of our binary
2292 * vector.
2293 */
2294static unsigned long
2295fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2296{
2297 unsigned long result = 1UL << frac_bits;
2298
2299 if (n) for (;;) {
2300 if (n & 1) {
2301 result *= x;
2302 result += 1UL << (frac_bits - 1);
2303 result >>= frac_bits;
2304 }
2305 n >>= 1;
2306 if (!n)
2307 break;
2308 x *= x;
2309 x += 1UL << (frac_bits - 1);
2310 x >>= frac_bits;
2311 }
2312
2313 return result;
2314}
2315
2316/*
2317 * a1 = a0 * e + a * (1 - e)
2318 *
2319 * a2 = a1 * e + a * (1 - e)
2320 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2321 * = a0 * e^2 + a * (1 - e) * (1 + e)
2322 *
2323 * a3 = a2 * e + a * (1 - e)
2324 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2325 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2326 *
2327 * ...
2328 *
2329 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2330 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2331 * = a0 * e^n + a * (1 - e^n)
2332 *
2333 * [1] application of the geometric series:
2334 *
2335 * n 1 - x^(n+1)
2336 * S_n := \Sum x^i = -------------
2337 * i=0 1 - x
2338 */
2339static unsigned long
2340calc_load_n(unsigned long load, unsigned long exp,
2341 unsigned long active, unsigned int n)
2342{
2343
2344 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2345}
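
The comment block above derives a_n = a_0*e^n + a*(1 - e^n), which is why calc_load_n() can stand in for n missed calc_load() iterations by raising the decay factor to the n-th power with fixed_power_int(). A small stand-alone check of that equivalence, reusing the kernel's FSHIFT/FIXED_1/EXP_1 fixed-point constants (the starting load and the iteration count below are arbitrary):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1 << FSHIFT)
#define EXP_1   1884            /* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                     unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

int main(void)
{
        unsigned long load = 10 * FIXED_1, active = 0;
        unsigned long iter = load;
        unsigned int i, n = 12;         /* e.g. one minute of missed 5s samples */

        for (i = 0; i < n; i++)
                iter = calc_load(iter, EXP_1, active);

        /* Single catch-up step, as calc_load_n() does. */
        load = calc_load(load, fixed_power_int(EXP_1, FSHIFT, n), active);

        printf("iterated=%lu bulk=%lu\n", iter, load);  /* agree to within rounding */
        return 0;
}

The two results can differ by a few counts of accumulated rounding; the bulk form trades that for O(log n) multiplies instead of n.
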
2346
2347/*
2348 * NO_HZ can leave us missing all per-cpu ticks calling
2349 * calc_load_account_active(), but since an idle CPU folds its delta into
2350 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2351 * in the pending idle delta if our idle period crossed a load cycle boundary.
2352 *
2353 * Once we've updated the global active value, we need to apply the exponential
2354 * weights adjusted to the number of cycles missed.
2355 */
2356static void calc_global_nohz(void)
2357{
2358 long delta, active, n;
2359
2360 if (!time_before(jiffies, calc_load_update + 10)) {
2361 /*
2362 * Catch-up, fold however many we are behind still
2363 */
2364 delta = jiffies - calc_load_update - 10;
2365 n = 1 + (delta / LOAD_FREQ);
2366
2367 active = atomic_long_read(&calc_load_tasks);
2368 active = active > 0 ? active * FIXED_1 : 0;
2369
2370 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2371 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2372 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2373
2374 calc_load_update += n * LOAD_FREQ;
2375 }
2376
2377 /*
2378 * Flip the idle index...
2379 *
2380 * Make sure we first write the new time then flip the index, so that
2381 * calc_load_write_idx() will see the new time when it reads the new
2382 * index, this avoids a double flip messing things up.
2383 */
2384 smp_wmb();
2385 calc_load_idx++;
2386}
2387#else /* !CONFIG_NO_HZ_COMMON */
2388
2389static inline long calc_load_fold_idle(void) { return 0; }
2390static inline void calc_global_nohz(void) { }
2391
2392#endif /* CONFIG_NO_HZ_COMMON */
2393
2394/*
2395 * calc_load - update the avenrun load estimates 10 ticks after the
2396 * CPUs have updated calc_load_tasks.
2397 */
2398void calc_global_load(unsigned long ticks)
2399{
2400 long active, delta;
2401
2402 if (time_before(jiffies, calc_load_update + 10))
2403 return;
2404
2405 /*
2406 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2407 */
2408 delta = calc_load_fold_idle();
2409 if (delta)
2410 atomic_long_add(delta, &calc_load_tasks);
2411
2412 active = atomic_long_read(&calc_load_tasks);
2413 active = active > 0 ? active * FIXED_1 : 0;
2414
2415 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2416 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2417 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2418
2419 calc_load_update += LOAD_FREQ;
2420
2421 /*
2422 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2423 */
2424 calc_global_nohz();
2425}
2426
2427/*
2428 * Called from update_cpu_load() to periodically update this CPU's
2429 * active count.
2430 */
2431static void calc_load_account_active(struct rq *this_rq)
2432{
2433 long delta;
2434
2435 if (time_before(jiffies, this_rq->calc_load_update))
2436 return;
2437
2438 delta = calc_load_fold_active(this_rq);
2439 if (delta)
2440 atomic_long_add(delta, &calc_load_tasks);
2441
2442 this_rq->calc_load_update += LOAD_FREQ;
2443}
2444
2445/*
2446 * End of global load-average stuff
2447 */
2448
2449/*
2450 * The exact cpuload at various idx values, calculated at every tick would be
2451 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2452 *
2453 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2454 * on nth tick when cpu may be busy, then we have:
2455 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2456 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2457 *
2458 * decay_load_missed() below does efficient calculation of
2459 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2460 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2461 *
2462 * The calculation is approximated on a 128 point scale.
2463 * degrade_zero_ticks is the number of ticks after which load at any
2464 * particular idx is approximated to be zero.
2465 * degrade_factor is a precomputed table, a row for each load idx.
2466 * Each column corresponds to degradation factor for a power of two ticks,
2467 * based on 128 point scale.
2468 * Example:
2469 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2470 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2471 *
2472 * With this power of 2 load factors, we can degrade the load n times
2473 * by looking at 1 bits in n and doing as many mult/shift instead of
2474 * n mult/shifts needed by the exact degradation.
2475 */
2476#define DEGRADE_SHIFT 7
2477static const unsigned char
2478 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2479static const unsigned char
2480 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2481 {0, 0, 0, 0, 0, 0, 0, 0},
2482 {64, 32, 8, 0, 0, 0, 0, 0},
2483 {96, 72, 40, 12, 1, 0, 0},
2484 {112, 98, 75, 43, 15, 1, 0},
2485 {120, 112, 98, 76, 45, 16, 2} };
2486
2487/*
2488 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2489 * would be when CPU is idle and so we just decay the old load without
2490 * adding any new load.
2491 */
2492static unsigned long
2493decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2494{
2495 int j = 0;
2496
2497 if (!missed_updates)
2498 return load;
2499
2500 if (missed_updates >= degrade_zero_ticks[idx])
2501 return 0;
2502
2503 if (idx == 1)
2504 return load >> missed_updates;
2505
2506 while (missed_updates) {
2507 if (missed_updates % 2)
2508 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2509
2510 missed_updates >>= 1;
2511 j++;
2512 }
2513 return load;
2514}
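
As a quick cross-check of the degrade_factor table above: the exact per-tick factor at load index 2 is 3/4, so after 8 ticks the residual load is (3/4)^8 of the original, roughly 12.8 on the 128-point scale, which the table stores as 12. A throwaway check (compile with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double exact = pow(3.0 / 4.0, 8) * 128;         /* ~12.81 */

        printf("exact scaled factor %.2f vs. table entry 12\n", exact);
        return 0;
}
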
2515
2516/*
2517 * Update rq->cpu_load[] statistics. This function is usually called every
2518 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2519 * every tick. We fix it up based on jiffies.
2520 */
2521static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2522 unsigned long pending_updates)
2523{
2524 int i, scale;
2525
2526 this_rq->nr_load_updates++;
2527
2528 /* Update our load: */
2529 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2530 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2531 unsigned long old_load, new_load;
2532
2533 /* scale is effectively 1 << i now, and >> i divides by scale */
2534
2535 old_load = this_rq->cpu_load[i];
2536 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2537 new_load = this_load;
2538 /*
2539 * Round up the averaging division if load is increasing. This
2540 * prevents us from getting stuck on 9 if the load is 10, for
2541 * example.
2542 */
2543 if (new_load > old_load)
2544 new_load += scale - 1;
2545
2546 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2547 }
2548
2549 sched_avg_update(this_rq);
2550}
2551
2552#ifdef CONFIG_NO_HZ_COMMON
2553/*
2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2556 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2557 *
2558 * Therefore we cannot use the delta approach from the regular tick since that
2559 * would seriously skew the load calculation. However we'll make do for those
2560 * updates happening while idle (nohz_idle_balance) or coming out of idle
2561 * (tick_nohz_idle_exit).
2562 *
2563 * This means we might still be one tick off for nohz periods.
2564 */
2565
2566/*
2567 * Called from nohz_idle_balance() to update the load ratings before doing the
2568 * idle balance.
2569 */
2570void update_idle_cpu_load(struct rq *this_rq)
2571{
2572 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2573 unsigned long load = this_rq->load.weight;
2574 unsigned long pending_updates;
2575
2576 /*
2577 * bail if there's load or we're actually up-to-date.
2578 */
2579 if (load || curr_jiffies == this_rq->last_load_update_tick)
2580 return;
2581
2582 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2583 this_rq->last_load_update_tick = curr_jiffies;
2584
2585 __update_cpu_load(this_rq, load, pending_updates);
2586}
2587
2588/*
2589 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2590 */
2591void update_cpu_load_nohz(void)
2592{
2593 struct rq *this_rq = this_rq();
2594 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2595 unsigned long pending_updates;
2596
2597 if (curr_jiffies == this_rq->last_load_update_tick)
2598 return;
2599
2600 raw_spin_lock(&this_rq->lock);
2601 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2602 if (pending_updates) {
2603 this_rq->last_load_update_tick = curr_jiffies;
2604 /*
2605 * We were idle, this means load 0, the current load might be
2606 * !0 due to remote wakeups and the sort.
2607 */
2608 __update_cpu_load(this_rq, 0, pending_updates);
2609 }
2610 raw_spin_unlock(&this_rq->lock);
2611}
2612#endif /* CONFIG_NO_HZ_COMMON */
2613
2614/*
2615 * Called from scheduler_tick()
2616 */
2617static void update_cpu_load_active(struct rq *this_rq)
2618{
2619 /*
2620 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2621 */
2622 this_rq->last_load_update_tick = jiffies;
2623 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2624
2625 calc_load_account_active(this_rq);
2626}
2627
2628#ifdef CONFIG_SMP 2067#ifdef CONFIG_SMP
2629 2068
2630/* 2069/*
@@ -2673,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2673 2112
2674 if (task_current(rq, p)) { 2113 if (task_current(rq, p)) {
2675 update_rq_clock(rq); 2114 update_rq_clock(rq);
2676 ns = rq->clock_task - p->se.exec_start; 2115 ns = rq_clock_task(rq) - p->se.exec_start;
2677 if ((s64)ns < 0) 2116 if ((s64)ns < 0)
2678 ns = 0; 2117 ns = 0;
2679 } 2118 }
@@ -2726,8 +2165,8 @@ void scheduler_tick(void)
2726 2165
2727 raw_spin_lock(&rq->lock); 2166 raw_spin_lock(&rq->lock);
2728 update_rq_clock(rq); 2167 update_rq_clock(rq);
2729 update_cpu_load_active(rq);
2730 curr->sched_class->task_tick(rq, curr, 0); 2168 curr->sched_class->task_tick(rq, curr, 0);
2169 update_cpu_load_active(rq);
2731 raw_spin_unlock(&rq->lock); 2170 raw_spin_unlock(&rq->lock);
2732 2171
2733 perf_event_task_tick(); 2172 perf_event_task_tick();
@@ -4745,7 +4184,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4745 */ 4184 */
4746 idle->sched_class = &idle_sched_class; 4185 idle->sched_class = &idle_sched_class;
4747 ftrace_graph_init_idle_task(idle, cpu); 4186 ftrace_graph_init_idle_task(idle, cpu);
4748 vtime_init_idle(idle); 4187 vtime_init_idle(idle, cpu);
4749#if defined(CONFIG_SMP) 4188#if defined(CONFIG_SMP)
4750 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4189 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4751#endif 4190#endif
@@ -4947,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu)
4947 */ 4386 */
4948 rq->stop = NULL; 4387 rq->stop = NULL;
4949 4388
4389 /*
4390 * put_prev_task() and pick_next_task() sched
4391 * class method both need to have an up-to-date
4392 * value of rq->clock[_task]
4393 */
4394 update_rq_clock(rq);
4395
4950 for ( ; ; ) { 4396 for ( ; ; ) {
4951 /* 4397 /*
4952 * There's this thread running, bail when that's the only 4398 * There's this thread running, bail when that's the only
@@ -5080,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5080 return table; 4526 return table;
5081} 4527}
5082 4528
5083static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4529static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5084{ 4530{
5085 struct ctl_table *entry, *table; 4531 struct ctl_table *entry, *table;
5086 struct sched_domain *sd; 4532 struct sched_domain *sd;
@@ -5894,7 +5340,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5894 get_group(cpu, sdd, &sd->groups); 5340 get_group(cpu, sdd, &sd->groups);
5895 atomic_inc(&sd->groups->ref); 5341 atomic_inc(&sd->groups->ref);
5896 5342
5897 if (cpu != cpumask_first(sched_domain_span(sd))) 5343 if (cpu != cpumask_first(span))
5898 return 0; 5344 return 0;
5899 5345
5900 lockdep_assert_held(&sched_domains_mutex); 5346 lockdep_assert_held(&sched_domains_mutex);
@@ -5904,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5904 5350
5905 for_each_cpu(i, span) { 5351 for_each_cpu(i, span) {
5906 struct sched_group *sg; 5352 struct sched_group *sg;
5907 int group = get_group(i, sdd, &sg); 5353 int group, j;
5908 int j;
5909 5354
5910 if (cpumask_test_cpu(i, covered)) 5355 if (cpumask_test_cpu(i, covered))
5911 continue; 5356 continue;
5912 5357
5358 group = get_group(i, sdd, &sg);
5913 cpumask_clear(sched_group_cpus(sg)); 5359 cpumask_clear(sched_group_cpus(sg));
5914 sg->sgp->power = 0; 5360 sg->sgp->power = 0;
5915 cpumask_setall(sched_group_mask(sg)); 5361 cpumask_setall(sched_group_mask(sg));
@@ -5947,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5947{ 5393{
5948 struct sched_group *sg = sd->groups; 5394 struct sched_group *sg = sd->groups;
5949 5395
5950 WARN_ON(!sd || !sg); 5396 WARN_ON(!sg);
5951 5397
5952 do { 5398 do {
5953 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5399 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6112,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = {
6112 5558
6113static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5559static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6114 5560
5561#define for_each_sd_topology(tl) \
5562 for (tl = sched_domain_topology; tl->init; tl++)
5563
6115#ifdef CONFIG_NUMA 5564#ifdef CONFIG_NUMA
6116 5565
6117static int sched_domains_numa_levels; 5566static int sched_domains_numa_levels;
@@ -6409,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6409 struct sched_domain_topology_level *tl; 5858 struct sched_domain_topology_level *tl;
6410 int j; 5859 int j;
6411 5860
6412 for (tl = sched_domain_topology; tl->init; tl++) { 5861 for_each_sd_topology(tl) {
6413 struct sd_data *sdd = &tl->data; 5862 struct sd_data *sdd = &tl->data;
6414 5863
6415 sdd->sd = alloc_percpu(struct sched_domain *); 5864 sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6462,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
6462 struct sched_domain_topology_level *tl; 5911 struct sched_domain_topology_level *tl;
6463 int j; 5912 int j;
6464 5913
6465 for (tl = sched_domain_topology; tl->init; tl++) { 5914 for_each_sd_topology(tl) {
6466 struct sd_data *sdd = &tl->data; 5915 struct sd_data *sdd = &tl->data;
6467 5916
6468 for_each_cpu(j, cpu_map) { 5917 for_each_cpu(j, cpu_map) {
@@ -6490,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6490} 5939}
6491 5940
6492struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5941struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6493 struct s_data *d, const struct cpumask *cpu_map, 5942 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6494 struct sched_domain_attr *attr, struct sched_domain *child, 5943 struct sched_domain *child, int cpu)
6495 int cpu)
6496{ 5944{
6497 struct sched_domain *sd = tl->init(tl, cpu); 5945 struct sched_domain *sd = tl->init(tl, cpu);
6498 if (!sd) 5946 if (!sd)
@@ -6503,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6503 sd->level = child->level + 1; 5951 sd->level = child->level + 1;
6504 sched_domain_level_max = max(sched_domain_level_max, sd->level); 5952 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6505 child->parent = sd; 5953 child->parent = sd;
5954 sd->child = child;
6506 } 5955 }
6507 sd->child = child;
6508 set_domain_attribute(sd, attr); 5956 set_domain_attribute(sd, attr);
6509 5957
6510 return sd; 5958 return sd;
@@ -6517,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6517static int build_sched_domains(const struct cpumask *cpu_map, 5965static int build_sched_domains(const struct cpumask *cpu_map,
6518 struct sched_domain_attr *attr) 5966 struct sched_domain_attr *attr)
6519{ 5967{
6520 enum s_alloc alloc_state = sa_none; 5968 enum s_alloc alloc_state;
6521 struct sched_domain *sd; 5969 struct sched_domain *sd;
6522 struct s_data d; 5970 struct s_data d;
6523 int i, ret = -ENOMEM; 5971 int i, ret = -ENOMEM;
@@ -6531,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_topology_level *tl; 5979 struct sched_domain_topology_level *tl;
6532 5980
6533 sd = NULL; 5981 sd = NULL;
6534 for (tl = sched_domain_topology; tl->init; tl++) { 5982 for_each_sd_topology(tl) {
6535 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 5983 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
5984 if (tl == sched_domain_topology)
5985 *per_cpu_ptr(d.sd, i) = sd;
6536 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 5986 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6537 sd->flags |= SD_OVERLAP; 5987 sd->flags |= SD_OVERLAP;
6538 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 5988 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6539 break; 5989 break;
6540 } 5990 }
6541
6542 while (sd->child)
6543 sd = sd->child;
6544
6545 *per_cpu_ptr(d.sd, i) = sd;
6546 } 5991 }
6547 5992
6548 /* Build the groups for the domains */ 5993 /* Build the groups for the domains */
@@ -6854,9 +6299,6 @@ void __init sched_init_smp(void)
6854 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6299 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6855 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6300 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6856 6301
6857 /* RT runtime code needs to handle some hotplug events */
6858 hotcpu_notifier(update_runtime, 0);
6859
6860 init_hrtick(); 6302 init_hrtick();
6861 6303
6862 /* Move init over to a non-isolated CPU */ 6304 /* Move init over to a non-isolated CPU */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a3..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515 515
516 for (;;) { 516 for (;;) {
517 /* Make sure "rtime" is the bigger of stime/rtime */ 517 /* Make sure "rtime" is the bigger of stime/rtime */
518 if (stime > rtime) { 518 if (stime > rtime)
519 u64 tmp = rtime; rtime = stime; stime = tmp; 519 swap(rtime, stime);
520 }
521 520
522 /* Make sure 'total' fits in 32 bits */ 521 /* Make sure 'total' fits in 32 bits */
523 if (total >> 32) 522 if (total >> 32)
@@ -747,17 +746,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
747 746
748 write_seqlock(&current->vtime_seqlock); 747 write_seqlock(&current->vtime_seqlock);
749 current->vtime_snap_whence = VTIME_SYS; 748 current->vtime_snap_whence = VTIME_SYS;
750 current->vtime_snap = sched_clock(); 749 current->vtime_snap = sched_clock_cpu(smp_processor_id());
751 write_sequnlock(&current->vtime_seqlock); 750 write_sequnlock(&current->vtime_seqlock);
752} 751}
753 752
754void vtime_init_idle(struct task_struct *t) 753void vtime_init_idle(struct task_struct *t, int cpu)
755{ 754{
756 unsigned long flags; 755 unsigned long flags;
757 756
758 write_seqlock_irqsave(&t->vtime_seqlock, flags); 757 write_seqlock_irqsave(&t->vtime_seqlock, flags);
759 t->vtime_snap_whence = VTIME_SYS; 758 t->vtime_snap_whence = VTIME_SYS;
760 t->vtime_snap = sched_clock(); 759 t->vtime_snap = sched_clock_cpu(cpu);
761 write_sequnlock_irqrestore(&t->vtime_seqlock, flags); 760 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
762} 761}
763 762
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
209 cfs_rq->nr_spread_over); 209 cfs_rq->nr_spread_over);
210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
212#ifdef CONFIG_FAIR_GROUP_SCHED
213#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", 213 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
215 cfs_rq->runnable_load_avg); 214 cfs_rq->runnable_load_avg);
216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", 215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
217 cfs_rq->blocked_load_avg); 216 cfs_rq->blocked_load_avg);
218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", 217#ifdef CONFIG_FAIR_GROUP_SCHED
219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); 218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
221 cfs_rq->tg_load_contrib); 219 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", 220 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib); 221 cfs_rq->tg_runnable_contrib);
222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
223 atomic_long_read(&cfs_rq->tg->load_avg));
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", 224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg)); 225 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 226#endif
227#endif
227 228
229#ifdef CONFIG_FAIR_GROUP_SCHED
228 print_cfs_group_stats(m, cpu, cfs_rq->tg); 230 print_cfs_group_stats(m, cpu, cfs_rq->tg);
229#endif 231#endif
230} 232}
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
494 get_nr_threads(p)); 496 get_nr_threads(p));
495 SEQ_printf(m, 497 SEQ_printf(m,
496 "---------------------------------------------------------\n"); 498 "---------------------------------------------------------"
499 "----------\n");
497#define __P(F) \ 500#define __P(F) \
498 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) 501 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
499#define P(F) \ 502#define P(F) \
500 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 503 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
501#define __PN(F) \ 504#define __PN(F) \
502 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 505 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
503#define PN(F) \ 506#define PN(F) \
504 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 507 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
505 508
506 PN(se.exec_start); 509 PN(se.exec_start);
507 PN(se.vruntime); 510 PN(se.vruntime);
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
560 } 563 }
561#endif 564#endif
562 __P(nr_switches); 565 __P(nr_switches);
563 SEQ_printf(m, "%-35s:%21Ld\n", 566 SEQ_printf(m, "%-45s:%21Ld\n",
564 "nr_voluntary_switches", (long long)p->nvcsw); 567 "nr_voluntary_switches", (long long)p->nvcsw);
565 SEQ_printf(m, "%-35s:%21Ld\n", 568 SEQ_printf(m, "%-45s:%21Ld\n",
566 "nr_involuntary_switches", (long long)p->nivcsw); 569 "nr_involuntary_switches", (long long)p->nivcsw);
567 570
568 P(se.load.weight); 571 P(se.load.weight);
572#ifdef CONFIG_SMP
573 P(se.avg.runnable_avg_sum);
574 P(se.avg.runnable_avg_period);
575 P(se.avg.load_avg_contrib);
576 P(se.avg.decay_count);
577#endif
569 P(policy); 578 P(policy);
570 P(prio); 579 P(prio);
571#undef PN 580#undef PN
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
579 588
580 t0 = cpu_clock(this_cpu); 589 t0 = cpu_clock(this_cpu);
581 t1 = cpu_clock(this_cpu); 590 t1 = cpu_clock(this_cpu);
582 SEQ_printf(m, "%-35s:%21Ld\n", 591 SEQ_printf(m, "%-45s:%21Ld\n",
583 "clock-delta", (long long)(t1-t0)); 592 "clock-delta", (long long)(t1-t0));
584 } 593 }
585} 594}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..f77f9c527449 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif 114#endif
115 115
116static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117{
118 lw->weight += inc;
119 lw->inv_weight = 0;
120}
121
122static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123{
124 lw->weight -= dec;
125 lw->inv_weight = 0;
126}
127
128static inline void update_load_set(struct load_weight *lw, unsigned long w)
129{
130 lw->weight = w;
131 lw->inv_weight = 0;
132}
133
116/* 134/*
117 * Increase the granularity value when there are more CPUs, 135 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible 136 * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
662 return calc_delta_fair(sched_slice(cfs_rq, se), se); 680 return calc_delta_fair(sched_slice(cfs_rq, se), se);
663} 681}
664 682
683#ifdef CONFIG_SMP
684static inline void __update_task_entity_contrib(struct sched_entity *se);
685
 686/* Give a new task initial runnable-average values so it starts out with a full load contribution */
687void init_task_runnable_average(struct task_struct *p)
688{
689 u32 slice;
690
691 p->se.avg.decay_count = 0;
692 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 p->se.avg.runnable_avg_sum = slice;
694 p->se.avg.runnable_avg_period = slice;
695 __update_task_entity_contrib(&p->se);
696}
697#else
698void init_task_runnable_average(struct task_struct *p)
699{
700}
701#endif
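
A note on the magnitudes above (a worked illustration, not part of the patch): sched_slice() returns the task's slice in nanoseconds, while the per-entity runnable averages are accumulated in ~1 us (1024 ns) steps, hence the >> 10. Seeding runnable_avg_sum == runnable_avg_period makes the new task look 100% runnable over its first slice, so __update_task_entity_contrib() starts load_avg_contrib near the task's full weight. With a hypothetical 3 ms slice: 3,000,000 ns >> 10 = 2929, giving sum = period = 2929 and a runnable ratio of 1.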
702
665/* 703/*
666 * Update the current task's runtime statistics. Skip current tasks that 704 * Update the current task's runtime statistics. Skip current tasks that
667 * are not in our scheduling class. 705 * are not in our scheduling class.
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
686static void update_curr(struct cfs_rq *cfs_rq) 724static void update_curr(struct cfs_rq *cfs_rq)
687{ 725{
688 struct sched_entity *curr = cfs_rq->curr; 726 struct sched_entity *curr = cfs_rq->curr;
689 u64 now = rq_of(cfs_rq)->clock_task; 727 u64 now = rq_clock_task(rq_of(cfs_rq));
690 unsigned long delta_exec; 728 unsigned long delta_exec;
691 729
692 if (unlikely(!curr)) 730 if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
718static inline void 756static inline void
719update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 757update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{ 758{
721 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); 759 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
722} 760}
723 761
724/* 762/*
@@ -738,14 +776,14 @@ static void
738update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 776update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739{ 777{
740 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, 778 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
741 rq_of(cfs_rq)->clock - se->statistics.wait_start)); 779 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
742 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); 780 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
743 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + 781 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
744 rq_of(cfs_rq)->clock - se->statistics.wait_start); 782 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
745#ifdef CONFIG_SCHEDSTATS 783#ifdef CONFIG_SCHEDSTATS
746 if (entity_is_task(se)) { 784 if (entity_is_task(se)) {
747 trace_sched_stat_wait(task_of(se), 785 trace_sched_stat_wait(task_of(se),
748 rq_of(cfs_rq)->clock - se->statistics.wait_start); 786 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
749 } 787 }
750#endif 788#endif
751 schedstat_set(se->statistics.wait_start, 0); 789 schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 /* 809 /*
772 * We are starting a new run period: 810 * We are starting a new run period:
773 */ 811 */
774 se->exec_start = rq_of(cfs_rq)->clock_task; 812 se->exec_start = rq_clock_task(rq_of(cfs_rq));
775} 813}
776 814
777/************************************************** 815/**************************************************
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1037 * to gain a more accurate current total weight. See 1075 * to gain a more accurate current total weight. See
1038 * update_cfs_rq_load_contribution(). 1076 * update_cfs_rq_load_contribution().
1039 */ 1077 */
1040 tg_weight = atomic64_read(&tg->load_avg); 1078 tg_weight = atomic_long_read(&tg->load_avg);
1041 tg_weight -= cfs_rq->tg_load_contrib; 1079 tg_weight -= cfs_rq->tg_load_contrib;
1042 tg_weight += cfs_rq->load.weight; 1080 tg_weight += cfs_rq->load.weight;
1043 1081
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1110} 1148}
1111#endif /* CONFIG_FAIR_GROUP_SCHED */ 1149#endif /* CONFIG_FAIR_GROUP_SCHED */
1112 1150
1113/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ 1151#ifdef CONFIG_SMP
1114#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1115/* 1152/*
1116 * We choose a half-life close to 1 scheduling period. 1153 * We choose a half-life close to 1 scheduling period.
1117 * Note: The tables below are dependent on this value. 1154 * Note: The tables below are dependent on this value.
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1319 int force_update) 1356 int force_update)
1320{ 1357{
1321 struct task_group *tg = cfs_rq->tg; 1358 struct task_group *tg = cfs_rq->tg;
1322 s64 tg_contrib; 1359 long tg_contrib;
1323 1360
1324 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 1361 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1325 tg_contrib -= cfs_rq->tg_load_contrib; 1362 tg_contrib -= cfs_rq->tg_load_contrib;
1326 1363
1327 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 1364 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1328 atomic64_add(tg_contrib, &tg->load_avg); 1365 atomic_long_add(tg_contrib, &tg->load_avg);
1329 cfs_rq->tg_load_contrib += tg_contrib; 1366 cfs_rq->tg_load_contrib += tg_contrib;
1330 } 1367 }
1331} 1368}
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
1360 u64 contrib; 1397 u64 contrib;
1361 1398
1362 contrib = cfs_rq->tg_load_contrib * tg->shares; 1399 contrib = cfs_rq->tg_load_contrib * tg->shares;
1363 se->avg.load_avg_contrib = div64_u64(contrib, 1400 se->avg.load_avg_contrib = div_u64(contrib,
1364 atomic64_read(&tg->load_avg) + 1); 1401 atomic_long_read(&tg->load_avg) + 1);
1365 1402
1366 /* 1403 /*
1367 * For group entities we need to compute a correction term in the case 1404 * For group entities we need to compute a correction term in the case
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1480 if (!decays && !force_update) 1517 if (!decays && !force_update)
1481 return; 1518 return;
1482 1519
1483 if (atomic64_read(&cfs_rq->removed_load)) { 1520 if (atomic_long_read(&cfs_rq->removed_load)) {
1484 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); 1521 unsigned long removed_load;
1522 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
1485 subtract_blocked_load_contrib(cfs_rq, removed_load); 1523 subtract_blocked_load_contrib(cfs_rq, removed_load);
1486 } 1524 }
1487 1525
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1497 1535
1498static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 1536static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1499{ 1537{
1500 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); 1538 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
1501 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 1539 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1502} 1540}
1503 1541
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1510 * We track migrations using entity decay_count <= 0, on a wake-up 1548 * We track migrations using entity decay_count <= 0, on a wake-up
1511 * migration we use a negative decay count to track the remote decays 1549 * migration we use a negative decay count to track the remote decays
1512 * accumulated while sleeping. 1550 * accumulated while sleeping.
1551 *
 1552 * Newly forked tasks are enqueued with se->avg.decay_count == 0, so they
1553 * are seen by enqueue_entity_load_avg() as a migration with an already
1554 * constructed load_avg_contrib.
1513 */ 1555 */
1514 if (unlikely(se->avg.decay_count <= 0)) { 1556 if (unlikely(se->avg.decay_count <= 0)) {
1515 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; 1557 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
1516 if (se->avg.decay_count) { 1558 if (se->avg.decay_count) {
1517 /* 1559 /*
1518 * In a wake-up migration we have to approximate the 1560 * In a wake-up migration we have to approximate the
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1530 } 1572 }
1531 wakeup = 0; 1573 wakeup = 0;
1532 } else { 1574 } else {
1533 __synchronize_entity_decay(se); 1575 /*
1576 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1577 * would have made count negative); we must be careful to avoid
1578 * double-accounting blocked time after synchronizing decays.
1579 */
1580 se->avg.last_runnable_update += __synchronize_entity_decay(se)
1581 << 20;
1534 } 1582 }
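
For the << 20 above (a clarifying note, not part of the patch): the blocked-load decay counter advances once per ~1 ms period, i.e. per 2^20 ns of cfs_rq task clock, so __synchronize_entity_decay() returns a count of such periods. Shifting that count left by 20 converts it back to nanoseconds and advances last_runnable_update by exactly the interval that was just decayed, so the same time is not double-accounted when the average is next updated.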
1535 1583
1536 /* migrated tasks did not contribute to our blocked load */ 1584 /* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1607 tsk = task_of(se); 1655 tsk = task_of(se);
1608 1656
1609 if (se->statistics.sleep_start) { 1657 if (se->statistics.sleep_start) {
1610 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; 1658 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
1611 1659
1612 if ((s64)delta < 0) 1660 if ((s64)delta < 0)
1613 delta = 0; 1661 delta = 0;
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1624 } 1672 }
1625 } 1673 }
1626 if (se->statistics.block_start) { 1674 if (se->statistics.block_start) {
1627 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; 1675 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
1628 1676
1629 if ((s64)delta < 0) 1677 if ((s64)delta < 0)
1630 delta = 0; 1678 delta = 0;
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1712{ 1760{
1713 /* 1761 /*
1714 * Update the normalized vruntime before updating min_vruntime 1762 * Update the normalized vruntime before updating min_vruntime
1715 * through callig update_curr(). 1763 * through calling update_curr().
1716 */ 1764 */
1717 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) 1765 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1718 se->vruntime += cfs_rq->min_vruntime; 1766 se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1805 struct task_struct *tsk = task_of(se); 1853 struct task_struct *tsk = task_of(se);
1806 1854
1807 if (tsk->state & TASK_INTERRUPTIBLE) 1855 if (tsk->state & TASK_INTERRUPTIBLE)
1808 se->statistics.sleep_start = rq_of(cfs_rq)->clock; 1856 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
1809 if (tsk->state & TASK_UNINTERRUPTIBLE) 1857 if (tsk->state & TASK_UNINTERRUPTIBLE)
1810 se->statistics.block_start = rq_of(cfs_rq)->clock; 1858 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
1811 } 1859 }
1812#endif 1860#endif
1813 } 1861 }
@@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2082 if (unlikely(cfs_rq->throttle_count)) 2130 if (unlikely(cfs_rq->throttle_count))
2083 return cfs_rq->throttled_clock_task; 2131 return cfs_rq->throttled_clock_task;
2084 2132
2085 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; 2133 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2086} 2134}
2087 2135
2088/* returns 0 on failure to allocate runtime */ 2136/* returns 0 on failure to allocate runtime */
@@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2138static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2186static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139{ 2187{
2140 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 2188 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2141 struct rq *rq = rq_of(cfs_rq);
2142 2189
2143 /* if the deadline is ahead of our clock, nothing to do */ 2190 /* if the deadline is ahead of our clock, nothing to do */
2144 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) 2191 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2145 return; 2192 return;
2146 2193
2147 if (cfs_rq->runtime_remaining < 0) 2194 if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
2230#ifdef CONFIG_SMP 2277#ifdef CONFIG_SMP
2231 if (!cfs_rq->throttle_count) { 2278 if (!cfs_rq->throttle_count) {
2232 /* adjust cfs_rq_clock_task() */ 2279 /* adjust cfs_rq_clock_task() */
2233 cfs_rq->throttled_clock_task_time += rq->clock_task - 2280 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
2234 cfs_rq->throttled_clock_task; 2281 cfs_rq->throttled_clock_task;
2235 } 2282 }
2236#endif 2283#endif
@@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
2245 2292
2246 /* group is entering throttled state, stop time */ 2293 /* group is entering throttled state, stop time */
2247 if (!cfs_rq->throttle_count) 2294 if (!cfs_rq->throttle_count)
2248 cfs_rq->throttled_clock_task = rq->clock_task; 2295 cfs_rq->throttled_clock_task = rq_clock_task(rq);
2249 cfs_rq->throttle_count++; 2296 cfs_rq->throttle_count++;
2250 2297
2251 return 0; 2298 return 0;
@@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2284 rq->nr_running -= task_delta; 2331 rq->nr_running -= task_delta;
2285 2332
2286 cfs_rq->throttled = 1; 2333 cfs_rq->throttled = 1;
2287 cfs_rq->throttled_clock = rq->clock; 2334 cfs_rq->throttled_clock = rq_clock(rq);
2288 raw_spin_lock(&cfs_b->lock); 2335 raw_spin_lock(&cfs_b->lock);
2289 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2336 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2290 raw_spin_unlock(&cfs_b->lock); 2337 raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2298 int enqueue = 1; 2345 int enqueue = 1;
2299 long task_delta; 2346 long task_delta;
2300 2347
2301 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2348 se = cfs_rq->tg->se[cpu_of(rq)];
2302 2349
2303 cfs_rq->throttled = 0; 2350 cfs_rq->throttled = 0;
2351
2352 update_rq_clock(rq);
2353
2304 raw_spin_lock(&cfs_b->lock); 2354 raw_spin_lock(&cfs_b->lock);
2305 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; 2355 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
2306 list_del_rcu(&cfs_rq->throttled_list); 2356 list_del_rcu(&cfs_rq->throttled_list);
2307 raw_spin_unlock(&cfs_b->lock); 2357 raw_spin_unlock(&cfs_b->lock);
2308 2358
2309 update_rq_clock(rq);
2310 /* update hierarchical throttle state */ 2359 /* update hierarchical throttle state */
2311 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 2360 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2312 2361
@@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2599 throttle_cfs_rq(cfs_rq); 2648 throttle_cfs_rq(cfs_rq);
2600} 2649}
2601 2650
2602static inline u64 default_cfs_period(void);
2603static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2604static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2605
2606static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 2651static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2607{ 2652{
2608 struct cfs_bandwidth *cfs_b = 2653 struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2706#else /* CONFIG_CFS_BANDWIDTH */ 2751#else /* CONFIG_CFS_BANDWIDTH */
2707static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 2752static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2708{ 2753{
2709 return rq_of(cfs_rq)->clock_task; 2754 return rq_clock_task(rq_of(cfs_rq));
2710} 2755}
2711 2756
2712static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2757static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2919/* Used instead of source_load when we know the type == 0 */ 2964/* Used instead of source_load when we know the type == 0 */
2920static unsigned long weighted_cpuload(const int cpu) 2965static unsigned long weighted_cpuload(const int cpu)
2921{ 2966{
2922 return cpu_rq(cpu)->load.weight; 2967 return cpu_rq(cpu)->cfs.runnable_load_avg;
2923} 2968}
2924 2969
2925/* 2970/*
@@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)
2964{ 3009{
2965 struct rq *rq = cpu_rq(cpu); 3010 struct rq *rq = cpu_rq(cpu);
2966 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 3011 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3012 unsigned long load_avg = rq->cfs.runnable_load_avg;
2967 3013
2968 if (nr_running) 3014 if (nr_running)
2969 return rq->load.weight / nr_running; 3015 return load_avg / nr_running;
2970 3016
2971 return 0; 3017 return 0;
2972} 3018}
@@ -3416,12 +3462,6 @@ unlock:
3416} 3462}
3417 3463
3418/* 3464/*
3419 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3420 * removed when useful for applications beyond shares distribution (e.g.
3421 * load-balance).
3422 */
3423#ifdef CONFIG_FAIR_GROUP_SCHED
3424/*
3425 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 3465 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3426 * cfs_rq_of(p) references at time of call are still valid and identify the 3466 * cfs_rq_of(p) references at time of call are still valid and identify the
3427 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 3467 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3441 */ 3481 */
3442 if (se->avg.decay_count) { 3482 if (se->avg.decay_count) {
3443 se->avg.decay_count = -__synchronize_entity_decay(se); 3483 se->avg.decay_count = -__synchronize_entity_decay(se);
3444 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); 3484 atomic_long_add(se->avg.load_avg_contrib,
3485 &cfs_rq->removed_load);
3445 } 3486 }
3446} 3487}
3447#endif
3448#endif /* CONFIG_SMP */ 3488#endif /* CONFIG_SMP */
3449 3489
3450static unsigned long 3490static unsigned long
@@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3946 * 2) too many balance attempts have failed. 3986 * 2) too many balance attempts have failed.
3947 */ 3987 */
3948 3988
3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3989 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
3950 if (!tsk_cache_hot || 3990 if (!tsk_cache_hot ||
3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3991 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3952 3992
@@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data)
4141 long cpu = (long)data; 4181 long cpu = (long)data;
4142 4182
4143 if (!tg->parent) { 4183 if (!tg->parent) {
4144 load = cpu_rq(cpu)->load.weight; 4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4145 } else { 4185 } else {
4146 load = tg->parent->cfs_rq[cpu]->h_load; 4186 load = tg->parent->cfs_rq[cpu]->h_load;
4147 load *= tg->se[cpu]->load.weight; 4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4148 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4149 } 4189 }
4150 4190
4151 tg->cfs_rq[cpu]->h_load = load; 4191 tg->cfs_rq[cpu]->h_load = load;
@@ -4171,12 +4211,9 @@ static void update_h_load(long cpu)
4171static unsigned long task_h_load(struct task_struct *p) 4211static unsigned long task_h_load(struct task_struct *p)
4172{ 4212{
4173 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4213 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 unsigned long load;
4175
4176 load = p->se.load.weight;
4177 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4178 4214
4179 return load; 4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1);
4180} 4217}
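
The new task_h_load() therefore scales the hierarchical load of the task's cfs_rq by the task's share of that cfs_rq's runnable load average. With hypothetical numbers (purely illustrative): cfs_rq->h_load = 2048, cfs_rq->runnable_load_avg = 4096 and p->se.avg.load_avg_contrib = 1024 give 2048 * 1024 / (4096 + 1) ≈ 511, i.e. the task carries roughly a quarter of the group's hierarchical load, matching its quarter share of the group's runnable average.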
4181#else 4218#else
4182static inline void update_blocked_averages(int cpu) 4219static inline void update_blocked_averages(int cpu)
@@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu)
4189 4226
4190static unsigned long task_h_load(struct task_struct *p) 4227static unsigned long task_h_load(struct task_struct *p)
4191{ 4228{
4192 return p->se.load.weight; 4229 return p->se.avg.load_avg_contrib;
4193} 4230}
4194#endif 4231#endif
4195 4232
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu)
4302 age_stamp = ACCESS_ONCE(rq->age_stamp); 4339 age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 avg = ACCESS_ONCE(rq->rt_avg); 4340 avg = ACCESS_ONCE(rq->rt_avg);
4304 4341
4305 total = sched_avg_period() + (rq->clock - age_stamp); 4342 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4306 4343
4307 if (unlikely(total < avg)) { 4344 if (unlikely(total < avg)) {
4308 /* Ensures that power won't end up being negative */ 4345 /* Ensures that power won't end up being negative */
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5241 int pulled_task = 0; 5278 int pulled_task = 0;
5242 unsigned long next_balance = jiffies + HZ; 5279 unsigned long next_balance = jiffies + HZ;
5243 5280
5244 this_rq->idle_stamp = this_rq->clock; 5281 this_rq->idle_stamp = rq_clock(this_rq);
5245 5282
5246 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5283 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 return; 5284 return;
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu)
5418static inline void set_cpu_sd_state_busy(void) 5455static inline void set_cpu_sd_state_busy(void)
5419{ 5456{
5420 struct sched_domain *sd; 5457 struct sched_domain *sd;
5421 int cpu = smp_processor_id();
5422 5458
5423 rcu_read_lock(); 5459 rcu_read_lock();
5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5460 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5425 5461
5426 if (!sd || !sd->nohz_idle) 5462 if (!sd || !sd->nohz_idle)
5427 goto unlock; 5463 goto unlock;
@@ -5436,10 +5472,9 @@ unlock:
5436void set_cpu_sd_state_idle(void) 5472void set_cpu_sd_state_idle(void)
5437{ 5473{
5438 struct sched_domain *sd; 5474 struct sched_domain *sd;
5439 int cpu = smp_processor_id();
5440 5475
5441 rcu_read_lock(); 5476 rcu_read_lock();
5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5477 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5443 5478
5444 if (!sd || sd->nohz_idle) 5479 if (!sd || sd->nohz_idle)
5445 goto unlock; 5480 goto unlock;
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5848 se->vruntime -= cfs_rq->min_vruntime; 5883 se->vruntime -= cfs_rq->min_vruntime;
5849 } 5884 }
5850 5885
5851#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5886#ifdef CONFIG_SMP
5852 /* 5887 /*
5853 * Remove our load from contribution when we leave sched_fair 5888 * Remove our load from contribution when we leave sched_fair
5854 * and ensure we don't carry in an old decay_count if we 5889 * and ensure we don't carry in an old decay_count if we
@@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5907#ifndef CONFIG_64BIT 5942#ifndef CONFIG_64BIT
5908 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5943 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909#endif 5944#endif
5910#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5945#ifdef CONFIG_SMP
5911 atomic64_set(&cfs_rq->decay_counter, 1); 5946 atomic64_set(&cfs_rq->decay_counter, 1);
5912 atomic64_set(&cfs_rq->removed_load, 0); 5947 atomic_long_set(&cfs_rq->removed_load, 0);
5913#endif 5948#endif
5914} 5949}
5915 5950
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6091 se = tg->se[i]; 6126 se = tg->se[i];
6092 /* Propagate contribution to hierarchy */ 6127 /* Propagate contribution to hierarchy */
6093 raw_spin_lock_irqsave(&rq->lock, flags); 6128 raw_spin_lock_irqsave(&rq->lock, flags);
6129
6130 /* Possible calls to update_curr() need rq clock */
6131 update_rq_clock(rq);
6094 for_each_sched_entity(se) 6132 for_each_sched_entity(se)
6095 update_cfs_shares(group_cfs_rq(se)); 6133 update_cfs_shares(group_cfs_rq(se));
6096 raw_spin_unlock_irqrestore(&rq->lock, flags); 6134 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = {
6146 6184
6147#ifdef CONFIG_SMP 6185#ifdef CONFIG_SMP
6148 .select_task_rq = select_task_rq_fair, 6186 .select_task_rq = select_task_rq_fair,
6149#ifdef CONFIG_FAIR_GROUP_SCHED
6150 .migrate_task_rq = migrate_task_rq_fair, 6187 .migrate_task_rq = migrate_task_rq_fair,
6151#endif 6188
6152 .rq_online = rq_online_fair, 6189 .rq_online = rq_online_fair,
6153 .rq_offline = rq_offline_fair, 6190 .rq_offline = rq_offline_fair,
6154 6191
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..16f5a30f9c88
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
1/*
2 * kernel/sched/proc.c
3 *
4 * Kernel load calculations, forked from sched/core.c
5 */
6
7#include <linux/export.h>
8
9#include "sched.h"
10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/*
19 * Global load-average calculations
20 *
21 * We take a distributed and async approach to calculating the global load-avg
22 * in order to minimize overhead.
23 *
24 * The global load average is an exponentially decaying average of nr_running +
25 * nr_uninterruptible.
26 *
27 * Once every LOAD_FREQ:
28 *
29 * nr_active = 0;
30 * for_each_possible_cpu(cpu)
 31 * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
32 *
 33 * avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
34 *
 35 * Due to a number of reasons the above turns into the mess below:
36 *
37 * - for_each_possible_cpu() is prohibitively expensive on machines with
 38 * a serious number of cpus, therefore we need to take a distributed approach
39 * to calculating nr_active.
40 *
41 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
42 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
43 *
44 * So assuming nr_active := 0 when we start out -- true per definition, we
45 * can simply take per-cpu deltas and fold those into a global accumulate
46 * to obtain the same result. See calc_load_fold_active().
47 *
48 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
49 * across the machine, we assume 10 ticks is sufficient time for every
50 * cpu to have completed this task.
51 *
52 * This places an upper-bound on the IRQ-off latency of the machine. Then
 53 * again, being late doesn't lose the delta, just wrecks the sample.
54 *
55 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
56 * this would add another cross-cpu cacheline miss and atomic operation
57 * to the wakeup path. Instead we increment on whatever cpu the task ran
58 * when it went into uninterruptible state and decrement on whatever cpu
59 * did the wakeup. This means that only the sum of nr_uninterruptible over
60 * all cpus yields the correct result.
61 *
62 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
63 */
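
In closed form, the update sketched above is the usual exponentially weighted moving average (stated here for reference with the fixed-point constants from <linux/sched.h>; not part of this patch):

\[ \mathrm{avenrun}_k(t) = \mathrm{avenrun}_k(t-\Delta)\,e_k + n_{\mathrm{active}}(t)\,(1-e_k), \qquad e_k = e^{-\Delta/\tau_k},\ \Delta \approx 5\,\mathrm{s},\ \tau_k \in \{1, 5, 15\}\ \mathrm{min} \]

On the FSHIFT = 11 scale (FIXED_1 = 2048) this yields EXP_1 = 1884, EXP_5 = 2014 and EXP_15 = 2037, with one sample folded in every LOAD_FREQ (~5 s).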
64
65/* Variables and functions for calc_load */
66atomic_long_t calc_load_tasks;
67unsigned long calc_load_update;
68unsigned long avenrun[3];
69EXPORT_SYMBOL(avenrun); /* should be removed */
70
71/**
72 * get_avenrun - get the load average array
73 * @loads: pointer to dest load array
74 * @offset: offset to add
75 * @shift: shift count to shift the result left
76 *
77 * These values are estimates at best, so no need for locking.
78 */
79void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80{
81 loads[0] = (avenrun[0] + offset) << shift;
82 loads[1] = (avenrun[1] + offset) << shift;
83 loads[2] = (avenrun[2] + offset) << shift;
84}
85
86long calc_load_fold_active(struct rq *this_rq)
87{
88 long nr_active, delta = 0;
89
90 nr_active = this_rq->nr_running;
91 nr_active += (long) this_rq->nr_uninterruptible;
92
93 if (nr_active != this_rq->calc_load_active) {
94 delta = nr_active - this_rq->calc_load_active;
95 this_rq->calc_load_active = nr_active;
96 }
97
98 return delta;
99}
100
101/*
102 * a1 = a0 * e + a * (1 - e)
103 */
104static unsigned long
105calc_load(unsigned long load, unsigned long exp, unsigned long active)
106{
107 load *= exp;
108 load += active * (FIXED_1 - exp);
109 load += 1UL << (FSHIFT - 1);
110 return load >> FSHIFT;
111}
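
To see the fixed-point step in isolation, here is a minimal user-space sketch (it assumes the standard FSHIFT = 11 / FIXED_1 = 2048 / EXP_1 = 1884 constants from <linux/sched.h>; the input numbers are hypothetical and nothing here is part of the patch):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)        /* 1.0 in fixed point */
#define EXP_1   1884                   /* ~ FIXED_1 * exp(-5s / 1min) */

/* the same round-to-nearest decay step as calc_load() above */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

int main(void)
{
        /* hypothetical: idle system, then four tasks become runnable */
        unsigned long avg = 0, active = 4 * FIXED_1;
        int i;

        for (i = 1; i <= 12; i++) {    /* twelve 5-second samples, ~1 minute */
                avg = calc_load(avg, EXP_1, active);
                printf("after %2d samples: %lu.%02lu\n", i,
                       avg >> FSHIFT, (avg & (FIXED_1 - 1)) * 100 / FIXED_1);
        }
        return 0;
}

Run standalone, this shows the 1-minute average climbing toward 4.00 at the familiar e-folding rate once the four tasks stay runnable.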
112
113#ifdef CONFIG_NO_HZ_COMMON
114/*
115 * Handle NO_HZ for the global load-average.
116 *
117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by
119 * NO_HZ.
120 *
121 * The basic idea is to fold the nr_active delta into a global idle-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
123 * when we read the global state.
124 *
125 * Obviously reality has to ruin such a delightfully simple scheme:
126 *
127 * - When we go NO_HZ idle during the window, we can negate our sample
128 * contribution, causing under-accounting.
129 *
130 * We avoid this by keeping two idle-delta counters and flipping them
131 * when the window starts, thus separating old and new NO_HZ load.
132 *
133 * The only trick is the slight shift in index flip for read vs write.
134 *
135 * 0s 5s 10s 15s
136 * +10 +10 +10 +10
137 * |-|-----------|-|-----------|-|-----------|-|
138 * r:0 0 1 1 0 0 1 1 0
139 * w:0 1 1 0 0 1 1 0 0
140 *
141 * This ensures we'll fold the old idle contribution in this window while
 142 * accumulating the new one.
143 *
144 * - When we wake up from NO_HZ idle during the window, we push up our
145 * contribution, since we effectively move our sample point to a known
146 * busy state.
147 *
148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the idle-delta for this cpu which
150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NOHZ idle for multiple
152 * LOAD_FREQ intervals.
153 *
154 * When making the ILB scale, we should try to pull this in as well.
155 */
156static atomic_long_t calc_load_idle[2];
157static int calc_load_idx;
158
159static inline int calc_load_write_idx(void)
160{
161 int idx = calc_load_idx;
162
163 /*
164 * See calc_global_nohz(), if we observe the new index, we also
165 * need to observe the new update time.
166 */
167 smp_rmb();
168
169 /*
170 * If the folding window started, make sure we start writing in the
171 * next idle-delta.
172 */
173 if (!time_before(jiffies, calc_load_update))
174 idx++;
175
176 return idx & 1;
177}
178
179static inline int calc_load_read_idx(void)
180{
181 return calc_load_idx & 1;
182}
183
184void calc_load_enter_idle(void)
185{
186 struct rq *this_rq = this_rq();
187 long delta;
188
189 /*
190 * We're going into NOHZ mode, if there's any pending delta, fold it
191 * into the pending idle delta.
192 */
193 delta = calc_load_fold_active(this_rq);
194 if (delta) {
195 int idx = calc_load_write_idx();
196 atomic_long_add(delta, &calc_load_idle[idx]);
197 }
198}
199
200void calc_load_exit_idle(void)
201{
202 struct rq *this_rq = this_rq();
203
204 /*
205 * If we're still before the sample window, we're done.
206 */
207 if (time_before(jiffies, this_rq->calc_load_update))
208 return;
209
210 /*
211 * We woke inside or after the sample window, this means we're already
212 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window.
214 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ;
218}
219
220static long calc_load_fold_idle(void)
221{
222 int idx = calc_load_read_idx();
223 long delta = 0;
224
225 if (atomic_long_read(&calc_load_idle[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228 return delta;
229}
230
231/**
232 * fixed_power_int - compute: x^n, in O(log n) time
233 *
234 * @x: base of the power
235 * @frac_bits: fractional bits of @x
236 * @n: power to raise @x to.
237 *
238 * By exploiting the relation between the definition of the natural power
239 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
240 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241 * (where: n_i \elem {0, 1}, the binary vector representing n),
242 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243 * of course trivially computable in O(log_2 n), the length of our binary
244 * vector.
245 */
246static unsigned long
247fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248{
249 unsigned long result = 1UL << frac_bits;
250
251 if (n) for (;;) {
252 if (n & 1) {
253 result *= x;
254 result += 1UL << (frac_bits - 1);
255 result >>= frac_bits;
256 }
257 n >>= 1;
258 if (!n)
259 break;
260 x *= x;
261 x += 1UL << (frac_bits - 1);
262 x >>= frac_bits;
263 }
264
265 return result;
266}
267
268/*
269 * a1 = a0 * e + a * (1 - e)
270 *
271 * a2 = a1 * e + a * (1 - e)
272 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273 * = a0 * e^2 + a * (1 - e) * (1 + e)
274 *
275 * a3 = a2 * e + a * (1 - e)
276 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278 *
279 * ...
280 *
281 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283 * = a0 * e^n + a * (1 - e^n)
284 *
285 * [1] application of the geometric series:
286 *
287 * n 1 - x^(n+1)
288 * S_n := \Sum x^i = -------------
289 * i=0 1 - x
290 */
291static unsigned long
292calc_load_n(unsigned long load, unsigned long exp,
293 unsigned long active, unsigned int n)
294{
295
296 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297}
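
As a quick sanity check on the shortcut (using the FSHIFT = 11 constants noted earlier; not part of the patch): fixed_power_int(EXP_1, FSHIFT, 3) works through the set bits of n = 3 and evaluates to 1594, while 2048 * e^{-3 * 5s / 60s} ≈ 1595, so calc_load_n(load, EXP_1, active, 3) folds three missed 5-second samples into a single step, off from the exact exponential by one fixed-point LSB.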
298
299/*
300 * NO_HZ can leave us missing all per-cpu ticks calling
301 * calc_load_account_active(), but since an idle CPU folds its delta into
302 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
303 * in the pending idle delta if our idle period crossed a load cycle boundary.
304 *
305 * Once we've updated the global active value, we need to apply the exponential
306 * weights adjusted to the number of cycles missed.
307 */
308static void calc_global_nohz(void)
309{
310 long delta, active, n;
311
312 if (!time_before(jiffies, calc_load_update + 10)) {
313 /*
314 * Catch-up, fold however many we are behind still
315 */
316 delta = jiffies - calc_load_update - 10;
317 n = 1 + (delta / LOAD_FREQ);
318
319 active = atomic_long_read(&calc_load_tasks);
320 active = active > 0 ? active * FIXED_1 : 0;
321
322 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326 calc_load_update += n * LOAD_FREQ;
327 }
328
329 /*
330 * Flip the idle index...
331 *
332 * Make sure we first write the new time then flip the index, so that
333 * calc_load_write_idx() will see the new time when it reads the new
334 * index, this avoids a double flip messing things up.
335 */
336 smp_wmb();
337 calc_load_idx++;
338}
339#else /* !CONFIG_NO_HZ_COMMON */
340
341static inline long calc_load_fold_idle(void) { return 0; }
342static inline void calc_global_nohz(void) { }
343
344#endif /* CONFIG_NO_HZ_COMMON */
345
346/*
347 * calc_load - update the avenrun load estimates 10 ticks after the
348 * CPUs have updated calc_load_tasks.
349 */
350void calc_global_load(unsigned long ticks)
351{
352 long active, delta;
353
354 if (time_before(jiffies, calc_load_update + 10))
355 return;
356
357 /*
358 * Fold the 'old' idle-delta to include all NO_HZ cpus.
359 */
360 delta = calc_load_fold_idle();
361 if (delta)
362 atomic_long_add(delta, &calc_load_tasks);
363
364 active = atomic_long_read(&calc_load_tasks);
365 active = active > 0 ? active * FIXED_1 : 0;
366
367 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371 calc_load_update += LOAD_FREQ;
372
373 /*
374 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375 */
376 calc_global_nohz();
377}
378
379/*
 380 * Called from update_cpu_load_active() to periodically update this CPU's
381 * active count.
382 */
383static void calc_load_account_active(struct rq *this_rq)
384{
385 long delta;
386
387 if (time_before(jiffies, this_rq->calc_load_update))
388 return;
389
390 delta = calc_load_fold_active(this_rq);
391 if (delta)
392 atomic_long_add(delta, &calc_load_tasks);
393
394 this_rq->calc_load_update += LOAD_FREQ;
395}
396
397/*
398 * End of global load-average stuff
399 */
400
401/*
402 * The exact cpuload at various idx values, calculated at every tick would be
403 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404 *
405 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406 * on nth tick when cpu may be busy, then we have:
407 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 408 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409 *
410 * decay_load_missed() below does efficient calculation of
411 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
412 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413 *
414 * The calculation is approximated on a 128 point scale.
415 * degrade_zero_ticks is the number of ticks after which load at any
416 * particular idx is approximated to be zero.
417 * degrade_factor is a precomputed table, a row for each load idx.
418 * Each column corresponds to degradation factor for a power of two ticks,
419 * based on 128 point scale.
420 * Example:
421 * row 2, col 3 (=12) says that the degradation at load idx 2 after
422 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423 *
 424 * With these power-of-2 load factors, we can degrade the load n times
425 * by looking at 1 bits in n and doing as many mult/shift instead of
426 * n mult/shifts needed by the exact degradation.
427 */
428#define DEGRADE_SHIFT 7
429static const unsigned char
430 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431static const unsigned char
432 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433 {0, 0, 0, 0, 0, 0, 0, 0},
434 {64, 32, 8, 0, 0, 0, 0, 0},
435 {96, 72, 40, 12, 1, 0, 0},
436 {112, 98, 75, 43, 15, 1, 0},
437 {120, 112, 98, 76, 45, 16, 2} };
438
439/*
 440 * Update cpu_load for any missed ticks due to tickless idle. The backlog
 441 * only builds up while the CPU is idle, so we just decay the old load without
442 * adding any new load.
443 */
444static unsigned long
445decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446{
447 int j = 0;
448
449 if (!missed_updates)
450 return load;
451
452 if (missed_updates >= degrade_zero_ticks[idx])
453 return 0;
454
455 if (idx == 1)
456 return load >> missed_updates;
457
458 while (missed_updates) {
459 if (missed_updates % 2)
460 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462 missed_updates >>= 1;
463 j++;
464 }
465 return load;
466}
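
Tying this back to the table comment above (a worked example, not part of the patch): for idx = 2 and missed_updates = 8 only bit 3 of the count is set, so the loop performs a single multiply by degrade_factor[2][3] = 12 and the load becomes load * 12 / 128 ≈ 0.094 * load, a close approximation of the exact per-tick decay (3/4)^8 ≈ 0.100.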
467
468/*
469 * Update rq->cpu_load[] statistics. This function is usually called every
470 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
471 * every tick. We fix it up based on jiffies.
472 */
473static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474 unsigned long pending_updates)
475{
476 int i, scale;
477
478 this_rq->nr_load_updates++;
479
480 /* Update our load: */
481 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483 unsigned long old_load, new_load;
484
485 /* scale is effectively 1 << i now, and >> i divides by scale */
486
487 old_load = this_rq->cpu_load[i];
488 old_load = decay_load_missed(old_load, pending_updates - 1, i);
489 new_load = this_load;
490 /*
491 * Round up the averaging division if load is increasing. This
492 * prevents us from getting stuck on 9 if the load is 10, for
493 * example.
494 */
495 if (new_load > old_load)
496 new_load += scale - 1;
497
498 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499 }
500
501 sched_avg_update(this_rq);
502}
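
As an illustration of the different horizons (hypothetical numbers, not part of the patch): if cpu_load[] is all zero and this_load jumps to 128, then after one call cpu_load[0] = 128, cpu_load[1] = 64, cpu_load[2] = 32, cpu_load[3] = 16 and cpu_load[4] = 8 (the round-up term only nudges the low bits). Each higher index tracks the instantaneous load more slowly and more smoothly, which is what the longer-horizon balancing heuristics rely on.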
503
504#ifdef CONFIG_SMP
505static inline unsigned long get_rq_runnable_load(struct rq *rq)
506{
507 return rq->cfs.runnable_load_avg;
508}
509#else
510static inline unsigned long get_rq_runnable_load(struct rq *rq)
511{
512 return rq->load.weight;
513}
514#endif
515
516#ifdef CONFIG_NO_HZ_COMMON
517/*
518 * There is no sane way to deal with nohz on smp when using jiffies because the
519 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
520 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
521 *
522 * Therefore we cannot use the delta approach from the regular tick since that
523 * would seriously skew the load calculation. However we'll make do for those
524 * updates happening while idle (nohz_idle_balance) or coming out of idle
525 * (tick_nohz_idle_exit).
526 *
527 * This means we might still be one tick off for nohz periods.
528 */
529
530/*
531 * Called from nohz_idle_balance() to update the load ratings before doing the
532 * idle balance.
533 */
534void update_idle_cpu_load(struct rq *this_rq)
535{
536 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
537 unsigned long load = get_rq_runnable_load(this_rq);
538 unsigned long pending_updates;
539
540 /*
541 * bail if there's load or we're actually up-to-date.
542 */
543 if (load || curr_jiffies == this_rq->last_load_update_tick)
544 return;
545
546 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
547 this_rq->last_load_update_tick = curr_jiffies;
548
549 __update_cpu_load(this_rq, load, pending_updates);
550}
551
552/*
553 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
554 */
555void update_cpu_load_nohz(void)
556{
557 struct rq *this_rq = this_rq();
558 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
559 unsigned long pending_updates;
560
561 if (curr_jiffies == this_rq->last_load_update_tick)
562 return;
563
564 raw_spin_lock(&this_rq->lock);
565 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
566 if (pending_updates) {
567 this_rq->last_load_update_tick = curr_jiffies;
568 /*
569 * We were idle, this means load 0, the current load might be
570 * !0 due to remote wakeups and the sort.
571 */
572 __update_cpu_load(this_rq, 0, pending_updates);
573 }
574 raw_spin_unlock(&this_rq->lock);
575}
576#endif /* CONFIG_NO_HZ */
577
578/*
579 * Called from scheduler_tick()
580 */
581void update_cpu_load_active(struct rq *this_rq)
582{
583 unsigned long load = get_rq_runnable_load(this_rq);
584 /*
585 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
586 */
587 this_rq->last_load_update_tick = jiffies;
588 __update_cpu_load(this_rq, load, 1);
589
590 calc_load_account_active(this_rq);
591}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4ab..01970c8e64df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
399 (iter = next_task_group(iter)) && \ 399 (iter = next_task_group(iter)) && \
400 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 400 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
401 401
402static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
403{
404 list_add_rcu(&rt_rq->leaf_rt_rq_list,
405 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
406}
407
408static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
409{
410 list_del_rcu(&rt_rq->leaf_rt_rq_list);
411}
412
413#define for_each_leaf_rt_rq(rt_rq, rq) \
414 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
415
416#define for_each_sched_rt_entity(rt_se) \ 402#define for_each_sched_rt_entity(rt_se) \
417 for (; rt_se; rt_se = rt_se->parent) 403 for (; rt_se; rt_se = rt_se->parent)
418 404
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
472#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
473static inline const struct cpumask *sched_rt_period_mask(void) 459static inline const struct cpumask *sched_rt_period_mask(void)
474{ 460{
475 return cpu_rq(smp_processor_id())->rd->span; 461 return this_rq()->rd->span;
476} 462}
477#else 463#else
478static inline const struct cpumask *sched_rt_period_mask(void) 464static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;
509#define for_each_rt_rq(rt_rq, iter, rq) \ 495#define for_each_rt_rq(rt_rq, iter, rq) \
510 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 496 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
511 497
512static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
513{
514}
515
516static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
517{
518}
519
520#define for_each_leaf_rt_rq(rt_rq, rq) \
521 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
522
523#define for_each_sched_rt_entity(rt_se) \ 498#define for_each_sched_rt_entity(rt_se) \
524 for (; rt_se; rt_se = NULL) 499 for (; rt_se; rt_se = NULL)
525 500
@@ -699,15 +674,6 @@ balanced:
699 } 674 }
700} 675}
701 676
702static void disable_runtime(struct rq *rq)
703{
704 unsigned long flags;
705
706 raw_spin_lock_irqsave(&rq->lock, flags);
707 __disable_runtime(rq);
708 raw_spin_unlock_irqrestore(&rq->lock, flags);
709}
710
711static void __enable_runtime(struct rq *rq) 677static void __enable_runtime(struct rq *rq)
712{ 678{
713 rt_rq_iter_t iter; 679 rt_rq_iter_t iter;
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)
732 } 698 }
733} 699}
734 700
735static void enable_runtime(struct rq *rq)
736{
737 unsigned long flags;
738
739 raw_spin_lock_irqsave(&rq->lock, flags);
740 __enable_runtime(rq);
741 raw_spin_unlock_irqrestore(&rq->lock, flags);
742}
743
744int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
745{
746 int cpu = (int)(long)hcpu;
747
748 switch (action) {
749 case CPU_DOWN_PREPARE:
750 case CPU_DOWN_PREPARE_FROZEN:
751 disable_runtime(cpu_rq(cpu));
752 return NOTIFY_OK;
753
754 case CPU_DOWN_FAILED:
755 case CPU_DOWN_FAILED_FROZEN:
756 case CPU_ONLINE:
757 case CPU_ONLINE_FROZEN:
758 enable_runtime(cpu_rq(cpu));
759 return NOTIFY_OK;
760
761 default:
762 return NOTIFY_DONE;
763 }
764}
765
766static int balance_runtime(struct rt_rq *rt_rq) 701static int balance_runtime(struct rt_rq *rt_rq)
767{ 702{
768 int more = 0; 703 int more = 0;
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)
926 if (curr->sched_class != &rt_sched_class) 861 if (curr->sched_class != &rt_sched_class)
927 return; 862 return;
928 863
929 delta_exec = rq->clock_task - curr->se.exec_start; 864 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
930 if (unlikely((s64)delta_exec <= 0)) 865 if (unlikely((s64)delta_exec <= 0))
931 return; 866 return;
932 867
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)
936 curr->se.sum_exec_runtime += delta_exec; 871 curr->se.sum_exec_runtime += delta_exec;
937 account_group_exec_runtime(curr, delta_exec); 872 account_group_exec_runtime(curr, delta_exec);
938 873
939 curr->se.exec_start = rq->clock_task; 874 curr->se.exec_start = rq_clock_task(rq);
940 cpuacct_charge(curr, delta_exec); 875 cpuacct_charge(curr, delta_exec);
941 876
942 sched_rt_avg_update(rq, delta_exec); 877 sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1106 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1041 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1107 return; 1042 return;
1108 1043
1109 if (!rt_rq->rt_nr_running)
1110 list_add_leaf_rt_rq(rt_rq);
1111
1112 if (head) 1044 if (head)
1113 list_add(&rt_se->run_list, queue); 1045 list_add(&rt_se->run_list, queue);
1114 else 1046 else
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1128 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1060 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1129 1061
1130 dec_rt_tasks(rt_se, rt_rq); 1062 dec_rt_tasks(rt_se, rt_rq);
1131 if (!rt_rq->rt_nr_running)
1132 list_del_leaf_rt_rq(rt_rq);
1133} 1063}
1134 1064
1135/* 1065/*
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1385 } while (rt_rq); 1315 } while (rt_rq);
1386 1316
1387 p = rt_task_of(rt_se); 1317 p = rt_task_of(rt_se);
1388 p->se.exec_start = rq->clock_task; 1318 p->se.exec_start = rq_clock_task(rq);
1389 1319
1390 return p; 1320 return p;
1391} 1321}
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1434 return 0; 1364 return 0;
1435} 1365}
1436 1366
1437/* Return the second highest RT task, NULL otherwise */ 1367/*
1438static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1368 * Return the highest-priority pushable task on this rq which is suitable
 1369 * to run on the given cpu, or NULL if there is none
1370 */
1371static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1439{ 1372{
1440 struct task_struct *next = NULL; 1373 struct plist_head *head = &rq->rt.pushable_tasks;
1441 struct sched_rt_entity *rt_se; 1374 struct task_struct *p;
1442 struct rt_prio_array *array;
1443 struct rt_rq *rt_rq;
1444 int idx;
1445
1446 for_each_leaf_rt_rq(rt_rq, rq) {
1447 array = &rt_rq->active;
1448 idx = sched_find_first_bit(array->bitmap);
1449next_idx:
1450 if (idx >= MAX_RT_PRIO)
1451 continue;
1452 if (next && next->prio <= idx)
1453 continue;
1454 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1455 struct task_struct *p;
1456 1375
1457 if (!rt_entity_is_task(rt_se)) 1376 if (!has_pushable_tasks(rq))
1458 continue; 1377 return NULL;
1459 1378
1460 p = rt_task_of(rt_se); 1379 plist_for_each_entry(p, head, pushable_tasks) {
1461 if (pick_rt_task(rq, p, cpu)) { 1380 if (pick_rt_task(rq, p, cpu))
1462 next = p; 1381 return p;
1463 break;
1464 }
1465 }
1466 if (!next) {
1467 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1468 goto next_idx;
1469 }
1470 } 1382 }
1471 1383
1472 return next; 1384 return NULL;
1473} 1385}
1474 1386
1475static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1387static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)
1743 double_lock_balance(this_rq, src_rq); 1655 double_lock_balance(this_rq, src_rq);
1744 1656
1745 /* 1657 /*
1746 * Are there still pullable RT tasks? 1658 * We can only pull a task which is pushable
1659 * on its rq, and no others.
1747 */ 1660 */
1748 if (src_rq->rt.rt_nr_running <= 1) 1661 p = pick_highest_pushable_task(src_rq, this_cpu);
1749 goto skip;
1750
1751 p = pick_next_highest_task_rt(src_rq, this_cpu);
1752 1662
1753 /* 1663 /*
1754 * Do we have an RT task that preempts 1664 * Do we have an RT task that preempts
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)
2037{ 1947{
2038 struct task_struct *p = rq->curr; 1948 struct task_struct *p = rq->curr;
2039 1949
2040 p->se.exec_start = rq->clock_task; 1950 p->se.exec_start = rq_clock_task(rq);
2041 1951
2042 /* The running task is never eligible for pushing */ 1952 /* The running task is never eligible for pushing */
2043 dequeue_pushable_task(rq, p); 1953 dequeue_pushable_task(rq, p);
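
The kernel/sched/rt.c hunks above drop the per-leaf-rt_rq bitmap scan and instead walk the runqueue's pushable-tasks plist. Because a plist keeps its nodes sorted by priority, the first entry that passes pick_rt_task() is already the highest-priority pushable task and the walk can stop immediately. A minimal sketch of that property using the generic plist API; struct item, add_item() and first_matching() are hypothetical and not part of this patch:

#include <linux/plist.h>
#include <linux/types.h>

struct item {
	struct plist_node node;
	int payload;
};

static struct plist_head head = PLIST_HEAD_INIT(head);

static void add_item(struct item *it, int prio)
{
	plist_node_init(&it->node, prio);	/* lower value = higher priority */
	plist_add(&it->node, &head);		/* insertion keeps the list sorted */
}

static struct item *first_matching(bool (*match)(struct item *))
{
	struct item *it;

	/* Nodes come out in priority order, so the first match is the best one. */
	plist_for_each_entry(it, &head, node) {
		if (match(it))
			return it;
	}
	return NULL;
}
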
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..ef0a7b2439dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
10#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h" 11#include "cpuacct.h"
12 12
13struct rq;
14
13extern __read_mostly int scheduler_running; 15extern __read_mostly int scheduler_running;
14 16
17extern unsigned long calc_load_update;
18extern atomic_long_t calc_load_tasks;
19
20extern long calc_load_fold_active(struct rq *this_rq);
21extern void update_cpu_load_active(struct rq *this_rq);
22
15/* 23/*
16 * Convert user-nice values [ -20 ... 0 ... 19 ] 24 * Convert user-nice values [ -20 ... 0 ... 19 ]
17 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 25 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +148,11 @@ struct task_group {
140 struct cfs_rq **cfs_rq; 148 struct cfs_rq **cfs_rq;
141 unsigned long shares; 149 unsigned long shares;
142 150
143 atomic_t load_weight; 151#ifdef CONFIG_SMP
144 atomic64_t load_avg; 152 atomic_long_t load_avg;
145 atomic_t runnable_avg; 153 atomic_t runnable_avg;
146#endif 154#endif
155#endif
147 156
148#ifdef CONFIG_RT_GROUP_SCHED 157#ifdef CONFIG_RT_GROUP_SCHED
149 struct sched_rt_entity **rt_se; 158 struct sched_rt_entity **rt_se;
@@ -261,26 +270,21 @@ struct cfs_rq {
261#endif 270#endif
262 271
263#ifdef CONFIG_SMP 272#ifdef CONFIG_SMP
264/*
265 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
266 * removed when useful for applications beyond shares distribution (e.g.
267 * load-balance).
268 */
269#ifdef CONFIG_FAIR_GROUP_SCHED
270 /* 273 /*
271 * CFS Load tracking 274 * CFS Load tracking
272 * Under CFS, load is tracked on a per-entity basis and aggregated up. 275 * Under CFS, load is tracked on a per-entity basis and aggregated up.
273 * This allows for the description of both thread and group usage (in 276 * This allows for the description of both thread and group usage (in
274 * the FAIR_GROUP_SCHED case). 277 * the FAIR_GROUP_SCHED case).
275 */ 278 */
276 u64 runnable_load_avg, blocked_load_avg; 279 unsigned long runnable_load_avg, blocked_load_avg;
277 atomic64_t decay_counter, removed_load; 280 atomic64_t decay_counter;
278 u64 last_decay; 281 u64 last_decay;
279#endif /* CONFIG_FAIR_GROUP_SCHED */ 282 atomic_long_t removed_load;
280/* These always depend on CONFIG_FAIR_GROUP_SCHED */ 283
281#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
285 /* Required to track per-cpu representation of a task_group */
282 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
283 u64 tg_load_contrib; 287 unsigned long tg_load_contrib;
284#endif /* CONFIG_FAIR_GROUP_SCHED */ 288#endif /* CONFIG_FAIR_GROUP_SCHED */
285 289
286 /* 290 /*
@@ -353,7 +357,6 @@ struct rt_rq {
353 unsigned long rt_nr_boosted; 357 unsigned long rt_nr_boosted;
354 358
355 struct rq *rq; 359 struct rq *rq;
356 struct list_head leaf_rt_rq_list;
357 struct task_group *tg; 360 struct task_group *tg;
358#endif 361#endif
359}; 362};
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);
540#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 543#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
541#define raw_rq() (&__raw_get_cpu_var(runqueues)) 544#define raw_rq() (&__raw_get_cpu_var(runqueues))
542 545
546static inline u64 rq_clock(struct rq *rq)
547{
548 return rq->clock;
549}
550
551static inline u64 rq_clock_task(struct rq *rq)
552{
553 return rq->clock_task;
554}
555
543#ifdef CONFIG_SMP 556#ifdef CONFIG_SMP
544 557
545#define rcu_dereference_check_sched_domain(p) \ 558#define rcu_dereference_check_sched_domain(p) \
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884#define WF_FORK 0x02 /* child wakeup after fork */ 897#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 898#define WF_MIGRATED 0x4 /* internal use, task got migrated */
886 899
887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
888{
889 lw->weight += inc;
890 lw->inv_weight = 0;
891}
892
893static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
894{
895 lw->weight -= dec;
896 lw->inv_weight = 0;
897}
898
899static inline void update_load_set(struct load_weight *lw, unsigned long w)
900{
901 lw->weight = w;
902 lw->inv_weight = 0;
903}
904
905/* 900/*
906 * To aid in avoiding the subversion of "niceness" due to uneven distribution 901 * To aid in avoiding the subversion of "niceness" due to uneven distribution
907 * of tasks with abnormal "nice" values across CPUs the contribution that 902 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
1028extern void trigger_load_balance(struct rq *rq, int cpu); 1023extern void trigger_load_balance(struct rq *rq, int cpu);
1029extern void idle_balance(int this_cpu, struct rq *this_rq); 1024extern void idle_balance(int this_cpu, struct rq *this_rq);
1030 1025
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq); 1026extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq); 1027extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042 1028
1043#else /* CONFIG_SMP */ 1029#else /* CONFIG_SMP */
1044 1030
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
1051extern void sysrq_sched_debug_show(void); 1037extern void sysrq_sched_debug_show(void);
1052extern void sched_init_granularity(void); 1038extern void sched_init_granularity(void);
1053extern void update_max_interval(void); 1039extern void update_max_interval(void);
1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
1055extern void init_sched_rt_class(void); 1040extern void init_sched_rt_class(void);
1056extern void init_sched_fair_class(void); 1041extern void init_sched_fair_class(void);
1057 1042
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1063 1048
1064extern void update_idle_cpu_load(struct rq *this_rq); 1049extern void update_idle_cpu_load(struct rq *this_rq);
1065 1050
1051extern void init_task_runnable_average(struct task_struct *p);
1052
1066#ifdef CONFIG_PARAVIRT 1053#ifdef CONFIG_PARAVIRT
1067static inline u64 steal_ticks(u64 steal) 1054static inline u64 steal_ticks(u64 steal)
1068{ 1055{
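
The kernel/sched/sched.h hunk above adds rq_clock() and rq_clock_task() so scheduler code reads the runqueue clocks through accessors rather than dereferencing rq->clock and rq->clock_task directly; the stats.h, stop_task.c and rt.c hunks in this series are the matching call-site conversions. A sketch of the resulting calling convention; task_delta_ns() is a hypothetical helper, and since the accessors live in kernel/sched/sched.h it would only compile inside kernel/sched/:

#include "sched.h"	/* kernel/sched/sched.h, for rq_clock_task() */

/* Nanoseconds the entity has run since it was last put on the CPU. */
static inline u64 task_delta_ns(struct rq *rq, struct sched_entity *se)
{
	u64 now = rq_clock_task(rq);	/* accessor instead of rq->clock_task */

	return now - se->exec_start;
}
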
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..17d7065c3872 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct task_struct *t)
63{ 63{
64 unsigned long long now = task_rq(t)->clock, delta = 0; 64 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct task_struct *t)
81{ 81{
82 unsigned long long now = task_rq(t)->clock, delta = 0; 82 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock; 103 t->sched_info.last_queued = rq_clock(task_rq(t));
104} 104}
105 105
106/* 106/*
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)
112 */ 112 */
113static inline void sched_info_depart(struct task_struct *t) 113static inline void sched_info_depart(struct task_struct *t)
114{ 114{
115 unsigned long long delta = task_rq(t)->clock - 115 unsigned long long delta = rq_clock(task_rq(t)) -
116 t->sched_info.last_arrival; 116 t->sched_info.last_arrival;
117 117
118 rq_sched_info_depart(task_rq(t), delta); 118 rq_sched_info_depart(task_rq(t), delta);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5bed84a..e08fbeeb54b9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) { 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task; 31 stop->se.exec_start = rq_clock_task(rq);
32 return stop; 32 return stop;
33 } 33 }
34 34
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
57 struct task_struct *curr = rq->curr; 57 struct task_struct *curr = rq->curr;
58 u64 delta_exec; 58 u64 delta_exec;
59 59
60 delta_exec = rq->clock_task - curr->se.exec_start; 60 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0)) 61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0; 62 delta_exec = 0;
63 63
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
67 curr->se.sum_exec_runtime += delta_exec; 67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec); 68 account_group_exec_runtime(curr, delta_exec);
69 69
70 curr->se.exec_start = rq->clock_task; 70 curr->se.exec_start = rq_clock_task(rq);
71 cpuacct_charge(curr, delta_exec); 71 cpuacct_charge(curr, delta_exec);
72} 72}
73 73
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
79{ 79{
80 struct task_struct *stop = rq->stop; 80 struct task_struct *stop = rq->stop;
81 81
82 stop->se.exec_start = rq->clock_task; 82 stop->se.exec_start = rq_clock_task(rq);
83} 83}
84 84
85static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..ca25e6e704a2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
127 127
128void local_bh_disable(void) 128void local_bh_disable(void)
129{ 129{
130 __local_bh_disable((unsigned long)__builtin_return_address(0), 130 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
131 SOFTIRQ_DISABLE_OFFSET);
132} 131}
133 132
134EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
@@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt)
139 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
140 139
141 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
142 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on(_RET_IP_);
143 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
144} 143}
145 144
@@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
184 183
185void local_bh_enable(void) 184void local_bh_enable(void)
186{ 185{
187 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip(_RET_IP_);
188} 187}
189EXPORT_SYMBOL(local_bh_enable); 188EXPORT_SYMBOL(local_bh_enable);
190 189
@@ -195,8 +194,12 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 194EXPORT_SYMBOL(local_bh_enable_ip);
196 195
197/* 196/*
198 * We restart softirq processing for at most 2 ms, 197 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
199 * and if need_resched() is not set. 198 * but break the loop if need_resched() is set or after 2 ms.
199 * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
200 * certain cases, such as stop_machine(), jiffies may cease to
201 * increment and so we need the MAX_SOFTIRQ_RESTART limit as
202 * well to make sure we eventually return from this method.
200 * 203 *
201 * These limits have been established via experimentation. 204 * These limits have been established via experimentation.
202 * The two things to balance are latency and fairness - 205
@@ -204,6 +207,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
204 * should not be able to lock up the box. 207 * should not be able to lock up the box.
205 */ 208 */
206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 209#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
210#define MAX_SOFTIRQ_RESTART 10
207 211
208asmlinkage void __do_softirq(void) 212asmlinkage void __do_softirq(void)
209{ 213{
@@ -212,6 +216,7 @@ asmlinkage void __do_softirq(void)
212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 216 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 217 int cpu;
214 unsigned long old_flags = current->flags; 218 unsigned long old_flags = current->flags;
219 int max_restart = MAX_SOFTIRQ_RESTART;
215 220
216 /* 221 /*
217 * Mask out PF_MEMALLOC as current task context is borrowed for the 222
@@ -223,8 +228,7 @@ asmlinkage void __do_softirq(void)
223 pending = local_softirq_pending(); 228 pending = local_softirq_pending();
224 account_irq_enter_time(current); 229 account_irq_enter_time(current);
225 230
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 231 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
227 SOFTIRQ_OFFSET);
228 lockdep_softirq_enter(); 232 lockdep_softirq_enter();
229 233
230 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
@@ -265,7 +269,8 @@ restart:
265 269
266 pending = local_softirq_pending(); 270 pending = local_softirq_pending();
267 if (pending) { 271 if (pending) {
268 if (time_before(jiffies, end) && !need_resched()) 272 if (time_before(jiffies, end) && !need_resched() &&
273 --max_restart)
269 goto restart; 274 goto restart;
270 275
271 wakeup_softirqd(); 276 wakeup_softirqd();
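
The kernel/softirq.c change above bounds __do_softirq() by an iteration budget (MAX_SOFTIRQ_RESTART) on top of the existing 2 ms time budget: jiffies can stop advancing (for example under stop_machine()), in which case the time check alone would never fire. The shape of the resulting loop, reduced to a standalone sketch; process_batch(), more_pending() and defer_to_worker() are hypothetical stand-ins for the real softirq machinery:

#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/types.h>

static void process_batch(void) { }			/* stand-in: handle pending work */
static bool more_pending(void) { return false; }	/* stand-in: anything left?      */
static void defer_to_worker(void) { }			/* stand-in: punt to a kthread   */

static void bounded_restart_loop(void)
{
	unsigned long end = jiffies + msecs_to_jiffies(2);	/* time budget  */
	int max_restart = 10;					/* count budget */

restart:
	process_batch();
	if (more_pending()) {
		/*
		 * Restart only while the deadline has not passed, nobody needs
		 * the CPU, and the restart budget is not exhausted; the count
		 * protects against jiffies stalling (e.g. during stop_machine()).
		 */
		if (time_before(jiffies, end) && !need_resched() && --max_restart)
			goto restart;
		defer_to_worker();
	}
}
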
diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba21..2bbd9a73b54c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
362} 362}
363EXPORT_SYMBOL(unregister_reboot_notifier); 363EXPORT_SYMBOL(unregister_reboot_notifier);
364 364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
365/** 388/**
366 * kernel_restart - reboot the system 389 * kernel_restart - reboot the system
367 * @cmd: pointer to buffer containing command to execute for restart 390 * @cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
373void kernel_restart(char *cmd) 396void kernel_restart(char *cmd)
374{ 397{
375 kernel_restart_prepare(cmd); 398 kernel_restart_prepare(cmd);
376 disable_nonboot_cpus(); 399 migrate_to_reboot_cpu();
377 syscore_shutdown(); 400 syscore_shutdown();
378 if (!cmd) 401 if (!cmd)
379 printk(KERN_EMERG "Restarting system.\n"); 402 printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
400void kernel_halt(void) 423void kernel_halt(void)
401{ 424{
402 kernel_shutdown_prepare(SYSTEM_HALT); 425 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus(); 426 migrate_to_reboot_cpu();
404 syscore_shutdown(); 427 syscore_shutdown();
405 printk(KERN_EMERG "System halted.\n"); 428 printk(KERN_EMERG "System halted.\n");
406 kmsg_dump(KMSG_DUMP_HALT); 429 kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
419 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
420 if (pm_power_off_prepare) 443 if (pm_power_off_prepare)
421 pm_power_off_prepare(); 444 pm_power_off_prepare();
422 disable_nonboot_cpus(); 445 migrate_to_reboot_cpu();
423 syscore_shutdown(); 446 syscore_shutdown();
424 printk(KERN_EMERG "Power down.\n"); 447 printk(KERN_EMERG "Power down.\n");
425 kmsg_dump(KMSG_DUMP_POWEROFF); 448 kmsg_dump(KMSG_DUMP_POWEROFF);
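
kernel_restart(), kernel_halt() and kernel_power_off() above now call migrate_to_reboot_cpu() instead of disable_nonboot_cpus(): rather than hot-unplugging every other CPU, the rebooting task disables hotplug and pins itself onto one online CPU. Nothing re-enables hotplug on this path, which is fine because the system is going down. The same sequence, wrapped as a hypothetical reusable helper (bind_current_to_cpu() is not a kernel API):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

static void bind_current_to_cpu(int cpu)
{
	cpu_hotplug_disable();			/* keep the target CPU from going away */

	if (!cpu_online(cpu))			/* fall back to any online CPU */
		cpu = cpumask_first(cpu_online_mask);

	current->flags |= PF_NO_SETAFFINITY;	/* nobody may change our affinity now */
	set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
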
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..4ce13c3cedb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
120/* Constants used for minimum and maximum */ 120/* Constants used for minimum and maximum */
121#ifdef CONFIG_LOCKUP_DETECTOR 121#ifdef CONFIG_LOCKUP_DETECTOR
122static int sixty = 60; 122static int sixty = 60;
123static int neg_one = -1;
124#endif 123#endif
125 124
126static int zero; 125static int zero;
@@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = {
814 .maxlen = sizeof(int), 813 .maxlen = sizeof(int),
815 .mode = 0644, 814 .mode = 0644,
816 .proc_handler = proc_dowatchdog, 815 .proc_handler = proc_dowatchdog,
817 .extra1 = &neg_one, 816 .extra1 = &zero,
818 .extra2 = &sixty, 817 .extra2 = &sixty,
819 }, 818 },
820 { 819 {
@@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = {
1044 .mode = 0644, 1043 .mode = 0644,
1045 .proc_handler = perf_proc_update_handler, 1044 .proc_handler = perf_proc_update_handler,
1046 }, 1045 },
1046 {
1047 .procname = "perf_cpu_time_max_percent",
1048 .data = &sysctl_perf_cpu_time_max_percent,
1049 .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
1050 .mode = 0644,
1051 .proc_handler = perf_cpu_time_max_percent_handler,
1052 .extra1 = &zero,
1053 .extra2 = &one_hundred,
1054 },
1047#endif 1055#endif
1048#ifdef CONFIG_KMEMCHECK 1056#ifdef CONFIG_KMEMCHECK
1049 { 1057 {
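
The kernel/sysctl.c hunk adds a perf_cpu_time_max_percent knob whose writes are clamped to the range given by .extra1/.extra2 (0..100). The same pattern for a hypothetical knob, using the generic proc_dointvec_minmax handler instead of the perf-specific one; my_knob and my_table are illustrative names only:

#include <linux/sysctl.h>

static int my_knob = 25;
static int range_min;			/* 0   */
static int range_max = 100;		/* 100 */

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(my_knob),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,	/* rejects out-of-range writes */
		.extra1		= &range_min,
		.extra2		= &range_max,
	},
	{ }	/* table terminator */
};
/* Hooked up with register_sysctl_table() or placed in kern_table as above. */
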
diff --git a/kernel/time.c b/kernel/time.c
index d3617dbd3dca..7c7964c33ae7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 12ff13a838c6..8f5b3b98577b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error)
874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
875{ 875{
876 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
877 unsigned long flags;
878 877
879 pts_norm = pps_normalize_ts(*phase_ts); 878 pts_norm = pps_normalize_ts(*phase_ts);
880 879
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 24938d577669..20d6fba70652 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -511,6 +511,12 @@ again:
511 } 511 }
512 } 512 }
513 513
514 /*
515 * Remove the current cpu from the pending mask. The event is
516 * delivered immediately in tick_do_broadcast() !
517 */
518 cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
519
514 /* Take care of enforced broadcast requests */ 520 /* Take care of enforced broadcast requests */
515 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); 521 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
516 cpumask_clear(tick_broadcast_force_mask); 522 cpumask_clear(tick_broadcast_force_mask);
@@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
575 581
576 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 582 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
577 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 583 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
578 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
579 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { 584 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
585 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
580 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 586 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
581 /* 587 /*
582 * We only reprogram the broadcast timer if we 588 * We only reprogram the broadcast timer if we
@@ -593,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
593 } else { 599 } else {
594 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 600 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
595 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 601 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
596 if (dev->next_event.tv64 == KTIME_MAX)
597 goto out;
598 /* 602 /*
599 * The cpu which was handling the broadcast 603 * The cpu which was handling the broadcast
600 * timer marked this cpu in the broadcast 604 * timer marked this cpu in the broadcast
@@ -609,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)
609 goto out; 613 goto out;
610 614
611 /* 615 /*
616 * Bail out if there is no next event.
617 */
618 if (dev->next_event.tv64 == KTIME_MAX)
619 goto out;
620 /*
612 * If the pending bit is not set, then we are 621 * If the pending bit is not set, then we are
613 * either the CPU handling the broadcast 622 * either the CPU handling the broadcast
614 * interrupt or we got woken by something else. 623 * interrupt or we got woken by something else.
@@ -692,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
692 701
693 bc->event_handler = tick_handle_oneshot_broadcast; 702 bc->event_handler = tick_handle_oneshot_broadcast;
694 703
695 /* Take the do_timer update */
696 if (!tick_nohz_full_cpu(cpu))
697 tick_do_timer_cpu = cpu;
698
699 /* 704 /*
700 * We must be careful here. There might be other CPUs 705 * We must be careful here. There might be other CPUs
701 * waiting for periodic broadcast. We need to set the 706 * waiting for periodic broadcast. We need to set the
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4208138fbf4..0cf1c1453181 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
306 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
307 */ 307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL; 309 return NOTIFY_BAD;
310 break; 310 break;
311 } 311 }
312 return NOTIFY_OK; 312 return NOTIFY_OK;
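
The kernel/time/tick-sched.c fix returns NOTIFY_BAD instead of -EINVAL from the CPU-down notifier: the notifier chain only understands NOTIFY_* codes (or errnos wrapped with notifier_from_errno()), so a raw negative errno is not recognised as a veto and the CPU would have gone offline anyway. A hypothetical notifier vetoing an offline request; my_cpu_callback() and cpu_is_needed() are illustrative:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/types.h>

static bool cpu_is_needed(unsigned long cpu) { return false; }	/* stand-in */

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		if (cpu_is_needed((unsigned long)hcpu))
			return NOTIFY_BAD;	/* veto: keep this CPU online */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};
/* Registered with register_cpu_notifier(&my_cpu_nb). */
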
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 98cd470bbe49..baeeb5c87cf1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -975,6 +975,14 @@ static int timekeeping_suspend(void)
975 975
976 read_persistent_clock(&timekeeping_suspend_time); 976 read_persistent_clock(&timekeeping_suspend_time);
977 977
978 /*
979 * On some systems the persistent_clock can not be detected at
980 * timekeeping_init by its return value, so if we see a valid
981 * value returned, update the persistent_clock_exists flag.
982 */
983 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
984 persistent_clock_exist = true;
985
978 raw_spin_lock_irqsave(&timekeeper_lock, flags); 986 raw_spin_lock_irqsave(&timekeeper_lock, flags);
979 write_seqcount_begin(&timekeeper_seq); 987 write_seqcount_begin(&timekeeper_seq);
980 timekeeping_forward_now(tk); 988 timekeeping_forward_now(tk);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b549b0f5b977..6c508ff33c62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
120 120
121/* 121/*
122 * Traverse the ftrace_global_list, invoking all entries. The reason that we 122 * Traverse the ftrace_global_list, invoking all entries. The reason that we
123 * can use rcu_dereference_raw() is that elements removed from this list 123 * can use rcu_dereference_raw_notrace() is that elements removed from this list
124 * are simply leaked, so there is no need to interact with a grace-period 124 * are simply leaked, so there is no need to interact with a grace-period
125 * mechanism. The rcu_dereference_raw() calls are needed to handle 125 * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
126 * concurrent insertions into the ftrace_global_list. 126 * concurrent insertions into the ftrace_global_list.
127 * 127 *
128 * Silly Alpha and silly pointer-speculation compiler optimizations! 128 * Silly Alpha and silly pointer-speculation compiler optimizations!
129 */ 129 */
130#define do_for_each_ftrace_op(op, list) \ 130#define do_for_each_ftrace_op(op, list) \
131 op = rcu_dereference_raw(list); \ 131 op = rcu_dereference_raw_notrace(list); \
132 do 132 do
133 133
134/* 134/*
135 * Optimized for just a single item in the list (as that is the normal case). 135 * Optimized for just a single item in the list (as that is the normal case).
136 */ 136 */
137#define while_for_each_ftrace_op(op) \ 137#define while_for_each_ftrace_op(op) \
138 while (likely(op = rcu_dereference_raw((op)->next)) && \ 138 while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
139 unlikely((op) != &ftrace_list_end)) 139 unlikely((op) != &ftrace_list_end))
140 140
141static inline void ftrace_ops_init(struct ftrace_ops *ops) 141static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
779 if (hlist_empty(hhd)) 779 if (hlist_empty(hhd))
780 return NULL; 780 return NULL;
781 781
782 hlist_for_each_entry_rcu(rec, hhd, node) { 782 hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
783 if (rec->ip == ip) 783 if (rec->ip == ip)
784 return rec; 784 return rec;
785 } 785 }
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1165 1165
1166 hhd = &hash->buckets[key]; 1166 hhd = &hash->buckets[key];
1167 1167
1168 hlist_for_each_entry_rcu(entry, hhd, hlist) { 1168 hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
1169 if (entry->ip == ip) 1169 if (entry->ip == ip)
1170 return entry; 1170 return entry;
1171 } 1171 }
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1422 struct ftrace_hash *notrace_hash; 1422 struct ftrace_hash *notrace_hash;
1423 int ret; 1423 int ret;
1424 1424
1425 filter_hash = rcu_dereference_raw(ops->filter_hash); 1425 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1426 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1426 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1427 1427
1428 if ((ftrace_hash_empty(filter_hash) || 1428 if ((ftrace_hash_empty(filter_hash) ||
1429 ftrace_lookup_ip(filter_hash, ip)) && 1429 ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2920 * on the hash. rcu_read_lock is too dangerous here. 2920 * on the hash. rcu_read_lock is too dangerous here.
2921 */ 2921 */
2922 preempt_disable_notrace(); 2922 preempt_disable_notrace();
2923 hlist_for_each_entry_rcu(entry, hhd, node) { 2923 hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
2924 if (entry->ip == ip) 2924 if (entry->ip == ip)
2925 entry->ops->func(ip, parent_ip, &entry->data); 2925 entry->ops->func(ip, parent_ip, &entry->data);
2926 } 2926 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4d79485b3237..e71a8be4a6ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
652 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
653}; 653};
654 654
655int trace_clock_id;
656
657/* 655/*
658 * trace_parser_get_init - gets the buffer for trace parser 656 * trace_parser_get_init - gets the buffer for trace parser
659 */ 657 */
@@ -843,7 +841,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
843 841
844 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); 842 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
845 max_data->pid = tsk->pid; 843 max_data->pid = tsk->pid;
846 max_data->uid = task_uid(tsk); 844 /*
845 * If tsk == current, then use current_uid(), as that does not use
846 * RCU. The irq tracer can be called out of RCU scope.
847 */
848 if (tsk == current)
849 max_data->uid = current_uid();
850 else
851 max_data->uid = task_uid(tsk);
852
847 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 853 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
848 max_data->policy = tsk->policy; 854 max_data->policy = tsk->policy;
849 max_data->rt_priority = tsk->rt_priority; 855 max_data->rt_priority = tsk->rt_priority;
@@ -2818,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2818 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2824 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2819 2825
2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2826 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2821 if (trace_clocks[trace_clock_id].in_ns) 2827 if (trace_clocks[tr->clock_id].in_ns)
2822 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2828 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2823 2829
2824 /* stop the trace while dumping if we are not opening "snapshot" */ 2830 /* stop the trace while dumping if we are not opening "snapshot" */
@@ -3817,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3817 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3823 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3818 3824
3819 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 3825 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3820 if (trace_clocks[trace_clock_id].in_ns) 3826 if (trace_clocks[tr->clock_id].in_ns)
3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3827 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3822 3828
3823 iter->cpu_file = tc->cpu; 3829 iter->cpu_file = tc->cpu;
@@ -5087,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); 5093 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
5088 trace_seq_printf(s, "bytes: %ld\n", cnt); 5094 trace_seq_printf(s, "bytes: %ld\n", cnt);
5089 5095
5090 if (trace_clocks[trace_clock_id].in_ns) { 5096 if (trace_clocks[tr->clock_id].in_ns) {
5091 /* local or global for trace_clock */ 5097 /* local or global for trace_clock */
5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); 5098 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
5093 usec_rem = do_div(t, USEC_PER_SEC); 5099 usec_rem = do_div(t, USEC_PER_SEC);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f1..20572ed88c5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
700 700
701extern unsigned long trace_flags; 701extern unsigned long trace_flags;
702 702
703extern int trace_clock_id;
704
705/* Standard output formatting function used for function return traces */ 703/* Standard output formatting function used for function return traces */
706#ifdef CONFIG_FUNCTION_GRAPH_TRACER 704#ifdef CONFIG_FUNCTION_GRAPH_TRACER
707 705
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 55e2cf66967b..2901e3b88590 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
1159 /* stop the tracing. */ 1159 /* stop the tracing. */
1160 tracing_stop(); 1160 tracing_stop();
1161 /* check the trace buffer */ 1161 /* check the trace buffer */
1162 ret = trace_test_buffer(tr, &count); 1162 ret = trace_test_buffer(&tr->trace_buffer, &count);
1163 trace->reset(tr); 1163 trace->reset(tr);
1164 tracing_start(); 1164 tracing_start();
1165 1165
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ead..ce0daa320a26 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)
287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; 287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
288} 288}
289EXPORT_SYMBOL(bit_waitqueue); 289EXPORT_SYMBOL(bit_waitqueue);
290
291/*
292 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
293 * index (we're keying off bit -1, but that would produce a horrible hash
294 * value).
295 */
296static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
297{
298 if (BITS_PER_LONG == 64) {
299 unsigned long q = (unsigned long)p;
300 return bit_waitqueue((void *)(q & ~1), q & 1);
301 }
302 return bit_waitqueue(p, 0);
303}
304
305static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
306 void *arg)
307{
308 struct wait_bit_key *key = arg;
309 struct wait_bit_queue *wait_bit
310 = container_of(wait, struct wait_bit_queue, wait);
311 atomic_t *val = key->flags;
312
313 if (wait_bit->key.flags != key->flags ||
314 wait_bit->key.bit_nr != key->bit_nr ||
315 atomic_read(val) != 0)
316 return 0;
317 return autoremove_wake_function(wait, mode, sync, key);
318}
319
320/*
321 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
322 * the action passed to __wait_on_atomic_t() may return an error code. Nonzero
323 * return codes halt waiting and are passed back to the caller.
324 */
325static __sched
326int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
327 int (*action)(atomic_t *), unsigned mode)
328{
329 atomic_t *val;
330 int ret = 0;
331
332 do {
333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags;
335 if (atomic_read(val) == 0)
336 ret = (*action)(val);
337 } while (!ret && atomic_read(val) != 0);
338 finish_wait(wq, &q->wait);
339 return ret;
340}
341
342#define DEFINE_WAIT_ATOMIC_T(name, p) \
343 struct wait_bit_queue name = { \
344 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
345 .wait = { \
346 .private = current, \
347 .func = wake_atomic_t_function, \
348 .task_list = \
349 LIST_HEAD_INIT((name).wait.task_list), \
350 }, \
351 }
352
353__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
354 unsigned mode)
355{
356 wait_queue_head_t *wq = atomic_t_waitqueue(p);
357 DEFINE_WAIT_ATOMIC_T(wait, p);
358
359 return __wait_on_atomic_t(wq, &wait, action, mode);
360}
361EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
362
363/**
364 * wake_up_atomic_t - Wake up a waiter on an atomic_t
365 * @p: The atomic_t being waited on, a kernel virtual address
366 * (the waitqueue hash is keyed off that address)
367 *
368 * Wake up anyone waiting for the atomic_t to go to zero.
369 *
370 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
371 * check is done by the waiter's wake function, not by the waker itself).
372 */
373void wake_up_atomic_t(atomic_t *p)
374{
375 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
376}
377EXPORT_SYMBOL(wake_up_atomic_t);
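
The kernel/wait.c additions above give atomic_t a counterpart to the existing wait-on-bit machinery: a waiter sleeps until an atomic_t reaches zero, and whoever drops it to zero issues the wake-up. A sketch of how a caller might use the two exported entry points shown here, out_of_line_wait_on_atomic_t() and wake_up_atomic_t(); struct thing and its helpers are hypothetical, and the inline wrapper declared in include/linux/wait.h is not part of this hunk:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct thing {
	atomic_t users;
};

/* Action callback: just sleep until woken; a nonzero return would abort the wait. */
static int thing_wait_action(atomic_t *p)
{
	schedule();
	return 0;
}

static void thing_put(struct thing *t)
{
	/* Waker side: dropping the count to zero wakes anyone waiting for it. */
	if (atomic_dec_and_test(&t->users))
		wake_up_atomic_t(&t->users);
}

static void thing_drain(struct thing *t)
{
	/* Waiter side: block uninterruptibly until t->users reaches zero. */
	out_of_line_wait_on_atomic_t(&t->users, thing_wait_action,
				     TASK_UNINTERRUPTIBLE);
}
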
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c..f02c4a4a0c3c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
272static bool wq_disable_numa; 272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444); 273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274 274
275/* see the comment above the definition of WQ_POWER_EFFICIENT */
276#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
277static bool wq_power_efficient = true;
278#else
279static bool wq_power_efficient;
280#endif
281
282module_param_named(power_efficient, wq_power_efficient, bool, 0444);
283
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 284static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276 285
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 286/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
305EXPORT_SYMBOL_GPL(system_unbound_wq); 314EXPORT_SYMBOL_GPL(system_unbound_wq);
306struct workqueue_struct *system_freezable_wq __read_mostly; 315struct workqueue_struct *system_freezable_wq __read_mostly;
307EXPORT_SYMBOL_GPL(system_freezable_wq); 316EXPORT_SYMBOL_GPL(system_freezable_wq);
317struct workqueue_struct *system_power_efficient_wq __read_mostly;
318EXPORT_SYMBOL_GPL(system_power_efficient_wq);
319struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
320EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
308 321
309static int worker_thread(void *__worker); 322static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to, 323static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4086 struct workqueue_struct *wq; 4099 struct workqueue_struct *wq;
4087 struct pool_workqueue *pwq; 4100 struct pool_workqueue *pwq;
4088 4101
4102 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4103 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4104 flags |= WQ_UNBOUND;
4105
4089 /* allocate wq and format name */ 4106 /* allocate wq and format name */
4090 if (flags & WQ_UNBOUND) 4107 if (flags & WQ_UNBOUND)
4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4108 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void)
4985 WQ_UNBOUND_MAX_ACTIVE); 5002 WQ_UNBOUND_MAX_ACTIVE);
4986 system_freezable_wq = alloc_workqueue("events_freezable", 5003 system_freezable_wq = alloc_workqueue("events_freezable",
4987 WQ_FREEZABLE, 0); 5004 WQ_FREEZABLE, 0);
5005 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5006 WQ_POWER_EFFICIENT, 0);
5007 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5008 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5009 0);
4988 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 5010 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
4989 !system_unbound_wq || !system_freezable_wq); 5011 !system_unbound_wq || !system_freezable_wq ||
5012 !system_power_efficient_wq ||
5013 !system_freezable_power_efficient_wq);
4990 return 0; 5014 return 0;
4991} 5015}
4992early_initcall(init_workqueues); 5016early_initcall(init_workqueues);
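
kernel/workqueue.c gains a WQ_POWER_EFFICIENT flag and two shared queues, system_power_efficient_wq and its freezable variant: when the workqueue.power_efficient boot parameter or CONFIG_WQ_POWER_EFFICIENT_DEFAULT is set, such queues are silently turned into unbound ones, so the scheduler rather than the queueing CPU decides where the work runs. A sketch of how a driver might opt in; my_work_fn(), my_wq and my_init() are hypothetical:

#include <linux/errno.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	/* deferred, not latency-critical processing */
}
static DECLARE_WORK(my_work, my_work_fn);

static struct workqueue_struct *my_wq;

static int my_init(void)
{
	/* Private queue: becomes unbound when power-efficient mode is enabled. */
	my_wq = alloc_workqueue("my_wq", WQ_POWER_EFFICIENT, 0);
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);

	/* Alternatively, reuse the new shared system_power_efficient_wq. */
	return 0;
}
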
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ece..7e2204db0b1a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
64 64
65/* 65/*
66 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
67 * sched.c and workqueue.c. 67 * sched/core.c and workqueue.c.
68 */ 68 */
69void wq_worker_waking_up(struct task_struct *task, int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);