about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cpu.c 18
-rw-r--r-- kernel/cpuset.c 18
-rw-r--r-- kernel/exit.c 2
-rw-r--r-- kernel/futex.c 10
-rw-r--r-- kernel/hrtimer.c 121
-rw-r--r-- kernel/hw_breakpoint.c 146
-rw-r--r-- kernel/irq/spurious.c 2
-rw-r--r-- kernel/kgdb.c 56
-rw-r--r-- kernel/lockdep.c 16
-rw-r--r-- kernel/perf_event.c 79
-rw-r--r-- kernel/pm_qos_params.c 20
-rw-r--r-- kernel/resource.c 26
-rw-r--r-- kernel/sched.c 218
-rw-r--r-- kernel/sched_debug.c 13
-rw-r--r-- kernel/sched_fair.c 155
-rw-r--r-- kernel/sched_features.h 5
-rw-r--r-- kernel/sched_idletask.c 2
-rw-r--r-- kernel/sched_rt.c 2
-rw-r--r-- kernel/sys.c 14
-rw-r--r-- kernel/sysctl.c 30
-rw-r--r-- kernel/time/clocksource.c 2
-rw-r--r-- kernel/time/timer_list.c 5
-rw-r--r-- kernel/trace/trace_kprobe.c 37
-rw-r--r-- kernel/trace/trace_ksym.c 5
-rw-r--r-- kernel/workqueue.c 131
25 files changed, 716 insertions, 417 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7c4e2713df0a..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 213 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 214 if (err == NOTIFY_BAD) {
215 set_cpu_active(cpu, true);
216
215 nr_calls--; 217 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 218 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 219 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 225
224 /* Ensure that we are not runnable on dying cpu */ 226 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 227 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 228 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 229
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 230 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 231 if (err) {
232 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 233 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 234 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 235 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
292 294
293 err = _cpu_down(cpu, 0); 295 err = _cpu_down(cpu, 0);
294 296
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 297out:
299 cpu_maps_update_done(); 298 cpu_maps_update_done();
300 stop_machine_destroy(); 299 stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
387 * with the userspace trying to use the CPU hotplug at the same time 386 * with the userspace trying to use the CPU hotplug at the same time
388 */ 387 */
389 cpumask_clear(frozen_cpus); 388 cpumask_clear(frozen_cpus);
389
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
390 printk("Disabling non-boot CPUs ...\n"); 398 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 399 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 400 if (cpu == first_cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472d..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
737{ 737{
738} 738}
739 739
740static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
741 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
742{ 742{
743 *domains = NULL; 743 *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
872 if (retval < 0) 872 if (retval < 0)
873 return retval; 873 return retval;
874 874
875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
876 return -EINVAL; 876 return -EINVAL;
877 } 877 }
878 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2010 } 2010 }
2011 2011
2012 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2013 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2015 continue;
2016 2016
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2019 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2022 cpu_online_mask); 2022 cpu_active_mask);
2023 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2024 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2025 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2057 switch (phase) { 2057 switch (phase) {
2058 case CPU_ONLINE: 2058 case CPU_ONLINE:
2059 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2060 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2061 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2062 break; 2064 break;
2063 2065
2064 default: 2066 default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2067 2069
2068 cgroup_lock(); 2070 cgroup_lock();
2069 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2070 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2071 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2072 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2073 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2114 2116
2115void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2116{ 2118{
2117 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2118 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2119 2121
2120 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e9..6f50ef55a6f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code)
971 exit_thread(); 971 exit_thread();
972 cgroup_exit(tsk, 1); 972 cgroup_exit(tsk, 1);
973 973
974 if (group_dead && tsk->signal->leader) 974 if (group_dead)
975 disassociate_ctty(1); 975 disassociate_ctty(1);
976 976
977 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..d73ef1f3e55d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 304 */
305static int fault_in_user_writeable(u32 __user *uaddr) 305static int fault_in_user_writeable(u32 __user *uaddr)
306{ 306{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 307 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 308 int ret;
309
310 down_read(&mm->mmap_sem);
311 ret = get_user_pages(current, mm, (unsigned long)uaddr,
312 1, 1, 0, NULL, NULL);
313 up_read(&mm->mmap_sem);
314
309 return ret < 0 ? ret : 0; 315 return ret < 0 ? ret : 0;
310} 316}
311 317
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ede527708123..d2f9239dc6ba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -1217,30 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %llu ns\n",
1242 (unsigned long long) dev->min_delta_ns);
1243}
1244/* 1245/*
1245 * High resolution timer interrupt 1246 * High resolution timer interrupt
1246 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1249,21 +1250,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1249{ 1250{
1250 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1251 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1252 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1253 int nr_retries = 0; 1254 int i, retries = 0;
1254 int i;
1255 1255
1256 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1257 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1258 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1259 1259
1260 retry: 1260 entry_time = now = ktime_get();
1261 /* 5 retries is enough to notice a hang */ 1261retry:
1262 if (!(++nr_retries % 5))
1263 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1264
1265 now = ktime_get();
1266
1267 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1268 1263
1269 spin_lock(&cpu_base->lock); 1264 spin_lock(&cpu_base->lock);
@@ -1325,10 +1320,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1325 spin_unlock(&cpu_base->lock); 1320 spin_unlock(&cpu_base->lock);
1326 1321
1327 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1328 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1329 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1330 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1331 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1332} 1365}
1333 1366
1334/* 1367/*
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee1628411..366eedf949c0 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -52,7 +52,7 @@
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53 53
54/* Number of pinned task breakpoints in a cpu */ 54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); 55static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56 56
57/* Number of non-pinned cpu/task breakpoints in a cpu */ 57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
73static unsigned int max_task_bp_pinned(int cpu) 73static unsigned int max_task_bp_pinned(int cpu)
74{ 74{
75 int i; 75 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); 76 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77 77
78 for (i = HBP_NUM -1; i >= 0; i--) { 78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0) 79 if (tsk_pinned[i] > 0)
@@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu)
83 return 0; 83 return 0;
84} 84}
85 85
86static int task_bp_pinned(struct task_struct *tsk)
87{
88 struct perf_event_context *ctx = tsk->perf_event_ctxp;
89 struct list_head *list;
90 struct perf_event *bp;
91 unsigned long flags;
92 int count = 0;
93
94 if (WARN_ONCE(!ctx, "No perf context for this task"))
95 return 0;
96
97 list = &ctx->event_list;
98
99 spin_lock_irqsave(&ctx->lock, flags);
100
101 /*
102 * The current breakpoint counter is not included in the list
103 * at the open() callback time
104 */
105 list_for_each_entry(bp, list, event_entry) {
106 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
107 count++;
108 }
109
110 spin_unlock_irqrestore(&ctx->lock, flags);
111
112 return count;
113}
114
86/* 115/*
87 * Report the number of pinned/un-pinned breakpoints we have in 116 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1). 117 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */ 118 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) 119static void
120fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
91{ 121{
122 int cpu = bp->cpu;
123 struct task_struct *tsk = bp->ctx->task;
124
92 if (cpu >= 0) { 125 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 126 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu); 127 if (!tsk)
128 slots->pinned += max_task_bp_pinned(cpu);
129 else
130 slots->pinned += task_bp_pinned(tsk);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu); 131 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96 132
97 return; 133 return;
@@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
101 unsigned int nr; 137 unsigned int nr;
102 138
103 nr = per_cpu(nr_cpu_bp_pinned, cpu); 139 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu); 140 if (!tsk)
141 nr += max_task_bp_pinned(cpu);
142 else
143 nr += task_bp_pinned(tsk);
105 144
106 if (nr > slots->pinned) 145 if (nr > slots->pinned)
107 slots->pinned = nr; 146 slots->pinned = nr;
@@ -118,35 +157,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
118 */ 157 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 158static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{ 159{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned; 160 unsigned int *tsk_pinned;
125 struct list_head *list; 161 int count = 0;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143 162
144 spin_unlock_irqrestore(&ctx->lock, flags); 163 count = task_bp_pinned(tsk);
145 164
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) 165 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) { 166 if (enable) {
151 tsk_pinned[count]++; 167 tsk_pinned[count]++;
152 if (count > 0) 168 if (count > 0)
@@ -193,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
193 * - If attached to a single cpu, check: 209 * - If attached to a single cpu, check:
194 * 210 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 211 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM 212 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
197 * 213 *
198 * -> If there are already non-pinned counters in this cpu, it means 214 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them. 215 * there is already a free slot for them.
@@ -204,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
204 * - If attached to every cpus, check: 220 * - If attached to every cpus, check:
205 * 221 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 222 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM 223 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
208 * 224 *
209 * -> This is roughly the same, except we check the number of per cpu 225 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per tasks 226 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
216 * - If attached to a single cpu, check: 232 * - If attached to a single cpu, check:
217 * 233 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 234 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM 235 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
220 * 236 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 237 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * one register at least (or they will never be fed). 238 * one register at least (or they will never be fed).
@@ -224,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
224 * - If attached to every cpus, check: 240 * - If attached to every cpus, check:
225 * 241 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 242 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM 243 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
228 */ 244 */
229int reserve_bp_slot(struct perf_event *bp) 245int reserve_bp_slot(struct perf_event *bp)
230{ 246{
@@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp)
233 249
234 mutex_lock(&nr_bp_mutex); 250 mutex_lock(&nr_bp_mutex);
235 251
236 fetch_bp_busy_slots(&slots, bp->cpu); 252 fetch_bp_busy_slots(&slots, bp);
237 253
238 /* Flexible counters need to keep at least one slot */ 254 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 255 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
@@ -259,7 +275,7 @@ void release_bp_slot(struct perf_event *bp)
259} 275}
260 276
261 277
262int __register_perf_hw_breakpoint(struct perf_event *bp) 278int register_perf_hw_breakpoint(struct perf_event *bp)
263{ 279{
264 int ret; 280 int ret;
265 281
@@ -276,19 +292,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
276 * This is a quick hack that will be removed soon, once we remove 292 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace 293 * the tmp breakpoints from ptrace
278 */ 294 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event) 295 if (!bp->attr.disabled || !bp->overflow_handler)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 296 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281 297
282 return ret; 298 return ret;
283} 299}
284 300
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288
289 return __register_perf_hw_breakpoint(bp);
290}
291
292/** 301/**
293 * register_user_hw_breakpoint - register a hardware breakpoint for user space 302 * register_user_hw_breakpoint - register a hardware breakpoint for user space
294 * @attr: breakpoint attributes 303 * @attr: breakpoint attributes
@@ -297,7 +306,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
297 */ 306 */
298struct perf_event * 307struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr, 308register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered, 309 perf_overflow_handler_t triggered,
301 struct task_struct *tsk) 310 struct task_struct *tsk)
302{ 311{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 312 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
311 * @triggered: callback to trigger when we hit the breakpoint 320 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs 321 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */ 322 */
314struct perf_event * 323int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{ 324{
319 /* 325 u64 old_addr = bp->attr.bp_addr;
320 * FIXME: do it without unregistering 326 int old_type = bp->attr.bp_type;
321 * - We don't want to lose our slot 327 int old_len = bp->attr.bp_len;
322 * - If the new bp is incorrect, don't lose the older one 328 int err = 0;
323 */
324 unregister_hw_breakpoint(bp);
325 329
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 330 perf_event_disable(bp);
331
332 bp->attr.bp_addr = attr->bp_addr;
333 bp->attr.bp_type = attr->bp_type;
334 bp->attr.bp_len = attr->bp_len;
335
336 if (attr->disabled)
337 goto end;
338
339 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
340 if (!err)
341 perf_event_enable(bp);
342
343 if (err) {
344 bp->attr.bp_addr = old_addr;
345 bp->attr.bp_type = old_type;
346 bp->attr.bp_len = old_len;
347 if (!bp->attr.disabled)
348 perf_event_enable(bp);
349
350 return err;
351 }
352
353end:
354 bp->attr.disabled = attr->disabled;
355
356 return 0;
327} 357}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); 358EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329 359
@@ -348,7 +378,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
348 */ 378 */
349struct perf_event ** 379struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr, 380register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered) 381 perf_overflow_handler_t triggered)
352{ 382{
353 struct perf_event **cpu_events, **pevent, *bp; 383 struct perf_event **cpu_events, **pevent, *bp;
354 long err; 384 long err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf24..e49ea1c5232d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
220 /* 220 /*
221 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
223 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
224 * working systems 224 * working systems
225 */ 225 */
226 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d7014634022..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 625static int kgdb_activate_sw_breakpoints(void)
620{ 626{
621 unsigned long addr; 627 unsigned long addr;
622 int error = 0; 628 int error;
629 int ret = 0;
623 int i; 630 int i;
624 631
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 636 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 637 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 638 kgdb_break[i].saved_instr);
632 if (error) 639 if (error) {
633 return error; 640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
634 644
635 kgdb_flush_swbreak_addr(addr); 645 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 646 kgdb_break[i].state = BP_ACTIVE;
637 } 647 }
638 return 0; 648 return ret;
639} 649}
640 650
641static int kgdb_set_sw_break(unsigned long addr) 651static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 692static int kgdb_deactivate_sw_breakpoints(void)
683{ 693{
684 unsigned long addr; 694 unsigned long addr;
685 int error = 0; 695 int error;
696 int ret = 0;
686 int i; 697 int i;
687 698
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 702 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 703 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 704 kgdb_break[i].saved_instr);
694 if (error) 705 if (error) {
695 return error; 706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
696 709
697 kgdb_flush_swbreak_addr(addr); 710 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 711 kgdb_break[i].state = BP_SET;
699 } 712 }
700 return 0; 713 return ret;
701} 714}
702 715
703static int kgdb_remove_sw_break(unsigned long addr) 716static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1217 return 1;
1205 1218
1206 } else { 1219 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1209 } 1224 }
1210 1225
1211 /* Indicate fall through */ 1226 /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1410 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1411 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1412 unsigned long flags;
1413 int sstep_tries = 100;
1398 int error = 0; 1414 int error = 0;
1399 int i, cpu; 1415 int i, cpu;
1400 1416
@@ -1425,13 +1441,14 @@ acquirelock:
1425 cpu_relax(); 1441 cpu_relax();
1426 1442
1427 /* 1443 /*
1428 * Do not start the debugger connection on this CPU if the last 1444 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1445 * that was single stepping. To guard against a deadlock, the
1430 * debugger on a different CPU via a single step 1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1431 */ 1448 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1450 (kgdb_info[cpu].task &&
1434 1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1452 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1453 touch_softlockup_watchdog();
1437 clocksource_touch_watchdog(); 1454 clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
1524 } 1541 }
1525 1542
1526kgdb_restore: 1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1527 /* Free kgdb_active */ 1551 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1552 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1553 touch_softlockup_watchdog();
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d3151..4f8df01dbe51 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 168 if (time > lt->max)
169 lt->max = time; 169 lt->max = time;
170 170
171 if (time < lt->min || !lt->min) 171 if (time < lt->min || !lt->nr)
172 lt->min = time; 172 lt->min = time;
173 173
174 lt->total += time; 174 lt->total += time;
@@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 177
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 179{
180 dst->min += src->min; 180 if (!src->nr)
181 dst->max += src->max; 181 return;
182
183 if (src->max > dst->max)
184 dst->max = src->max;
185
186 if (src->min < dst->min || !dst->nr)
187 dst->min = src->min;
188
182 dst->total += src->total; 189 dst->total += src->total;
183 dst->nr += src->nr; 190 dst->nr += src->nr;
184} 191}
@@ -379,7 +386,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 386 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 387 * as incomplete, friggin useless </rant>
381 */ 388 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 389 if (trace->nr_entries != 0 &&
390 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 391 trace->nr_entries--;
384 392
385 trace->max_entries = trace->nr_entries; 393 trace->max_entries = trace->nr_entries;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd64..e73e53c7582f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
36/* 36/*
37 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
38 */ 38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40 40
41int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
476 if (!task) { 476 if (!task) {
477 /* 477 /*
478 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
479 * the removal is always sucessful. 479 * the removal is always successful.
480 */ 480 */
481 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -567,7 +567,7 @@ static void __perf_event_disable(void *info)
567 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
568 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
569 */ 569 */
570static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
571{ 571{
572 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
573 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -845,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx,
845 if (!task) { 845 if (!task) {
846 /* 846 /*
847 * Per cpu events are installed via an smp call and 847 * Per cpu events are installed via an smp call and
848 * the install is always sucessful. 848 * the install is always successful.
849 */ 849 */
850 smp_call_function_single(cpu, __perf_install_in_context, 850 smp_call_function_single(cpu, __perf_install_in_context,
851 event, 1); 851 event, 1);
@@ -971,7 +971,7 @@ static void __perf_event_enable(void *info)
971 * perf_event_for_each_child or perf_event_for_each as described 971 * perf_event_for_each_child or perf_event_for_each as described
972 * for perf_event_disable. 972 * for perf_event_disable.
973 */ 973 */
974static void perf_event_enable(struct perf_event *event) 974void perf_event_enable(struct perf_event *event)
975{ 975{
976 struct perf_event_context *ctx = event->ctx; 976 struct perf_event_context *ctx = event->ctx;
977 struct task_struct *task = ctx->task; 977 struct task_struct *task = ctx->task;
@@ -1579,7 +1579,6 @@ static void
1579__perf_event_init_context(struct perf_event_context *ctx, 1579__perf_event_init_context(struct perf_event_context *ctx,
1580 struct task_struct *task) 1580 struct task_struct *task)
1581{ 1581{
1582 memset(ctx, 0, sizeof(*ctx));
1583 spin_lock_init(&ctx->lock); 1582 spin_lock_init(&ctx->lock);
1584 mutex_init(&ctx->mutex); 1583 mutex_init(&ctx->mutex);
1585 INIT_LIST_HEAD(&ctx->group_list); 1584 INIT_LIST_HEAD(&ctx->group_list);
@@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1654 } 1653 }
1655 1654
1656 if (!ctx) { 1655 if (!ctx) {
1657 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1656 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1658 err = -ENOMEM; 1657 err = -ENOMEM;
1659 if (!ctx) 1658 if (!ctx)
1660 goto errout; 1659 goto errout;
@@ -4011,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4011 event->pmu->read(event); 4010 event->pmu->read(event);
4012 4011
4013 data.addr = 0; 4012 data.addr = 0;
4013 data.raw = NULL;
4014 data.period = event->hw.last_period; 4014 data.period = event->hw.last_period;
4015 regs = get_irq_regs(); 4015 regs = get_irq_regs();
4016 /* 4016 /*
@@ -4080,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4080 u64 now; 4080 u64 now;
4081 4081
4082 now = cpu_clock(cpu); 4082 now = cpu_clock(cpu);
4083 prev = atomic64_read(&event->hw.prev_count); 4083 prev = atomic64_xchg(&event->hw.prev_count, now);
4084 atomic64_set(&event->hw.prev_count, now);
4085 atomic64_add(now - prev, &event->count); 4084 atomic64_add(now - prev, &event->count);
4086} 4085}
4087 4086
@@ -4286,15 +4285,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4285static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{ 4286{
4288 int err; 4287 int err;
4289 /* 4288
4290 * The breakpoint is already filled if we haven't created the counter 4289 err = register_perf_hw_breakpoint(bp);
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err) 4290 if (err)
4299 return ERR_PTR(err); 4291 return ERR_PTR(err);
4300 4292
@@ -4308,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4308 struct perf_sample_data sample; 4300 struct perf_sample_data sample;
4309 struct pt_regs *regs = data; 4301 struct pt_regs *regs = data;
4310 4302
4303 sample.raw = NULL;
4311 sample.addr = bp->attr.bp_addr; 4304 sample.addr = bp->attr.bp_addr;
4312 4305
4313 if (!perf_exclude_event(bp, regs)) 4306 if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4390 struct perf_event_context *ctx, 4383 struct perf_event_context *ctx,
4391 struct perf_event *group_leader, 4384 struct perf_event *group_leader,
4392 struct perf_event *parent_event, 4385 struct perf_event *parent_event,
4393 perf_callback_t callback, 4386 perf_overflow_handler_t overflow_handler,
4394 gfp_t gfpflags) 4387 gfp_t gfpflags)
4395{ 4388{
4396 const struct pmu *pmu; 4389 const struct pmu *pmu;
@@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr,
4433 4426
4434 event->state = PERF_EVENT_STATE_INACTIVE; 4427 event->state = PERF_EVENT_STATE_INACTIVE;
4435 4428
4436 if (!callback && parent_event) 4429 if (!overflow_handler && parent_event)
4437 callback = parent_event->callback; 4430 overflow_handler = parent_event->overflow_handler;
4438 4431
4439 event->callback = callback; 4432 event->overflow_handler = overflow_handler;
4440 4433
4441 if (attr->disabled) 4434 if (attr->disabled)
4442 event->state = PERF_EVENT_STATE_OFF; 4435 event->state = PERF_EVENT_STATE_OFF;
@@ -4776,7 +4769,8 @@ err_put_context:
4776 */ 4769 */
4777struct perf_event * 4770struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 4771perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback) 4772 pid_t pid,
4773 perf_overflow_handler_t overflow_handler)
4780{ 4774{
4781 struct perf_event *event; 4775 struct perf_event *event;
4782 struct perf_event_context *ctx; 4776 struct perf_event_context *ctx;
@@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4793 } 4787 }
4794 4788
4795 event = perf_event_alloc(attr, cpu, ctx, NULL, 4789 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL); 4790 NULL, overflow_handler, GFP_KERNEL);
4797 if (IS_ERR(event)) { 4791 if (IS_ERR(event)) {
4798 err = PTR_ERR(event); 4792 err = PTR_ERR(event);
4799 goto err_put_context; 4793 goto err_put_context;
@@ -5090,7 +5084,7 @@ again:
5090 */ 5084 */
5091int perf_event_init_task(struct task_struct *child) 5085int perf_event_init_task(struct task_struct *child)
5092{ 5086{
5093 struct perf_event_context *child_ctx, *parent_ctx; 5087 struct perf_event_context *child_ctx = NULL, *parent_ctx;
5094 struct perf_event_context *cloned_ctx; 5088 struct perf_event_context *cloned_ctx;
5095 struct perf_event *event; 5089 struct perf_event *event;
5096 struct task_struct *parent = current; 5090 struct task_struct *parent = current;
@@ -5106,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child)
5106 return 0; 5100 return 0;
5107 5101
5108 /* 5102 /*
5109 * This is executed from the parent task context, so inherit
5110 * events that have been marked for cloning.
5111 * First allocate and initialize a context for the child.
5112 */
5113
5114 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5115 if (!child_ctx)
5116 return -ENOMEM;
5117
5118 __perf_event_init_context(child_ctx, child);
5119 child->perf_event_ctxp = child_ctx;
5120 get_task_struct(child);
5121
5122 /*
5123 * If the parent's context is a clone, pin it so it won't get 5103 * If the parent's context is a clone, pin it so it won't get
5124 * swapped under us. 5104 * swapped under us.
5125 */ 5105 */
@@ -5149,6 +5129,26 @@ int perf_event_init_task(struct task_struct *child)
5149 continue; 5129 continue;
5150 } 5130 }
5151 5131
5132 if (!child->perf_event_ctxp) {
5133 /*
5134 * This is executed from the parent task context, so
5135 * inherit events that have been marked for cloning.
5136 * First allocate and initialize a context for the
5137 * child.
5138 */
5139
5140 child_ctx = kzalloc(sizeof(struct perf_event_context),
5141 GFP_KERNEL);
5142 if (!child_ctx) {
5143 ret = -ENOMEM;
5144 goto exit;
5145 }
5146
5147 __perf_event_init_context(child_ctx, child);
5148 child->perf_event_ctxp = child_ctx;
5149 get_task_struct(child);
5150 }
5151
5152 ret = inherit_group(event, parent, parent_ctx, 5152 ret = inherit_group(event, parent, parent_ctx,
5153 child, child_ctx); 5153 child, child_ctx);
5154 if (ret) { 5154 if (ret) {
@@ -5177,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child)
5177 get_ctx(child_ctx->parent_ctx); 5177 get_ctx(child_ctx->parent_ctx);
5178 } 5178 }
5179 5179
5180exit:
5180 mutex_unlock(&parent_ctx->mutex); 5181 mutex_unlock(&parent_ctx->mutex);
5181 5182
5182 perf_unpin_context(parent_ctx); 5183 perf_unpin_context(parent_ctx);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 resource_size_t start, end;
311 312
312 new->start = root->start; 313 start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to new->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 end = root->end;
326 if (new->start < min) 327 if (start < min)
327 new->start = min; 328 start = min;
328 if (new->end > max) 329 if (end > max)
329 new->end = max; 330 end = max;
330 new->start = ALIGN(new->start, align); 331 start = ALIGN(start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, new, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (start < end && end - start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = start;
336 new->end = start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a257..ff39cadf621e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1815#endif
1811 1816
1812static void calc_load_account_active(struct rq *this_rq); 1817static void calc_load_account_active(struct rq *this_rq);
1818static void update_sysctl(void);
1819static int get_update_sysctl_factor(void);
1820
1821static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822{
1823 set_task_rq(p, cpu);
1824#ifdef CONFIG_SMP
1825 /*
1826 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1827 * successfully executed on another CPU. We must ensure that updates of
1828 * per-task data have been completed by this moment.
1829 */
1830 smp_wmb();
1831 task_thread_info(p)->cpu = cpu;
1832#endif
1833}
1813 1834
1814#include "sched_stats.h" 1835#include "sched_stats.h"
1815#include "sched_idletask.c" 1836#include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1988 return cpu_curr(task_cpu(p)) == p;
1968} 1989}
1969 1990
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1991static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1992 const struct sched_class *prev_class,
1986 int oldprio, int running) 1993 int oldprio, int running)
@@ -2060,29 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2060void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2067void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2061{ 2068{
2062 int old_cpu = task_cpu(p); 2069 int old_cpu = task_cpu(p);
2063 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2064 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2070 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2065 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2071 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2066 u64 clock_offset;
2067
2068 clock_offset = old_rq->clock - new_rq->clock;
2069 2072
2070 trace_sched_migrate_task(p, new_cpu); 2073 trace_sched_migrate_task(p, new_cpu);
2071 2074
2072#ifdef CONFIG_SCHEDSTATS
2073 if (p->se.wait_start)
2074 p->se.wait_start -= clock_offset;
2075 if (p->se.sleep_start)
2076 p->se.sleep_start -= clock_offset;
2077 if (p->se.block_start)
2078 p->se.block_start -= clock_offset;
2079#endif
2080 if (old_cpu != new_cpu) { 2075 if (old_cpu != new_cpu) {
2081 p->se.nr_migrations++; 2076 p->se.nr_migrations++;
2082#ifdef CONFIG_SCHEDSTATS
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2077 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0); 2078 1, 1, NULL, 0);
2088 } 2079 }
@@ -2323,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
2323 preempt_enable(); 2314 preempt_enable();
2324} 2315}
2325 2316
2317#ifdef CONFIG_SMP
2318static inline
2319int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2320{
2321 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2322}
2323#endif
2324
2326/*** 2325/***
2327 * try_to_wake_up - wake up a thread 2326 * try_to_wake_up - wake up a thread
2328 * @p: the to-be-woken-up thread 2327 * @p: the to-be-woken-up thread
@@ -2374,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2374 if (task_contributes_to_load(p)) 2373 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--; 2374 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING; 2375 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags); 2376 __task_rq_unlock(rq);
2378 2377
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2378 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2380 if (cpu != orig_cpu) { 2379 if (cpu != orig_cpu)
2381 local_irq_save(flags);
2382 rq = cpu_rq(cpu);
2383 update_rq_clock(rq);
2384 set_task_cpu(p, cpu); 2380 set_task_cpu(p, cpu);
2385 local_irq_restore(flags); 2381
2386 } 2382 rq = __task_rq_lock(p);
2387 rq = task_rq_lock(p, &flags); 2383 update_rq_clock(rq);
2388 2384
2389 WARN_ON(p->state != TASK_WAKING); 2385 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p); 2386 cpu = task_cpu(p);
@@ -2499,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
2499 p->se.avg_overlap = 0; 2495 p->se.avg_overlap = 0;
2500 p->se.start_runtime = 0; 2496 p->se.start_runtime = 0;
2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2497 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2503 2498
2504#ifdef CONFIG_SCHEDSTATS 2499#ifdef CONFIG_SCHEDSTATS
2505 p->se.wait_start = 0; 2500 p->se.wait_start = 0;
@@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
2521 p->se.nr_failed_migrations_running = 0; 2516 p->se.nr_failed_migrations_running = 0;
2522 p->se.nr_failed_migrations_hot = 0; 2517 p->se.nr_failed_migrations_hot = 0;
2523 p->se.nr_forced_migrations = 0; 2518 p->se.nr_forced_migrations = 0;
2524 p->se.nr_forced2_migrations = 0;
2525 2519
2526 p->se.nr_wakeups = 0; 2520 p->se.nr_wakeups = 0;
2527 p->se.nr_wakeups_sync = 0; 2521 p->se.nr_wakeups_sync = 0;
@@ -2558,7 +2552,6 @@ static void __sched_fork(struct task_struct *p)
2558void sched_fork(struct task_struct *p, int clone_flags) 2552void sched_fork(struct task_struct *p, int clone_flags)
2559{ 2553{
2560 int cpu = get_cpu(); 2554 int cpu = get_cpu();
2561 unsigned long flags;
2562 2555
2563 __sched_fork(p); 2556 __sched_fork(p);
2564 2557
@@ -2592,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags)
2592 if (!rt_prio(p->prio)) 2585 if (!rt_prio(p->prio))
2593 p->sched_class = &fair_sched_class; 2586 p->sched_class = &fair_sched_class;
2594 2587
2588 if (p->sched_class->task_fork)
2589 p->sched_class->task_fork(p);
2590
2595#ifdef CONFIG_SMP 2591#ifdef CONFIG_SMP
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2592 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2597#endif 2593#endif
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu); 2594 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602 2595
2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2596#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2604 if (likely(sched_info_on())) 2597 if (likely(sched_info_on()))
@@ -2631,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2631 rq = task_rq_lock(p, &flags); 2624 rq = task_rq_lock(p, &flags);
2632 BUG_ON(p->state != TASK_RUNNING); 2625 BUG_ON(p->state != TASK_RUNNING);
2633 update_rq_clock(rq); 2626 update_rq_clock(rq);
2634 2627 activate_task(rq, p, 0);
2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2636 activate_task(rq, p, 0);
2637 } else {
2638 /*
2639 * Let the scheduling class do new task startup
2640 * management (if any):
2641 */
2642 p->sched_class->task_new(rq, p);
2643 inc_nr_running(rq);
2644 }
2645 trace_sched_wakeup_new(rq, p, 1); 2628 trace_sched_wakeup_new(rq, p, 1);
2646 check_preempt_curr(rq, p, WF_FORK); 2629 check_preempt_curr(rq, p, WF_FORK);
2647#ifdef CONFIG_SMP 2630#ifdef CONFIG_SMP
@@ -3156,7 +3139,7 @@ out:
3156void sched_exec(void) 3139void sched_exec(void)
3157{ 3140{
3158 int new_cpu, this_cpu = get_cpu(); 3141 int new_cpu, this_cpu = get_cpu();
3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3142 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3160 put_cpu(); 3143 put_cpu();
3161 if (new_cpu != this_cpu) 3144 if (new_cpu != this_cpu)
3162 sched_migrate_task(current, new_cpu); 3145 sched_migrate_task(current, new_cpu);
@@ -3172,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3172 deactivate_task(src_rq, p, 0); 3155 deactivate_task(src_rq, p, 0);
3173 set_task_cpu(p, this_cpu); 3156 set_task_cpu(p, this_cpu);
3174 activate_task(this_rq, p, 0); 3157 activate_task(this_rq, p, 0);
3175 /*
3176 * Note that idle threads have a prio of MAX_PRIO, for this test
3177 * to be always true for them.
3178 */
3179 check_preempt_curr(this_rq, p, 0); 3158 check_preempt_curr(this_rq, p, 0);
3180} 3159}
3181 3160
@@ -4134,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4134 unsigned long flags; 4113 unsigned long flags;
4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4114 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4136 4115
4137 cpumask_copy(cpus, cpu_online_mask); 4116 cpumask_copy(cpus, cpu_active_mask);
4138 4117
4139 /* 4118 /*
4140 * When power savings policy is enabled for the parent domain, idle 4119 * When power savings policy is enabled for the parent domain, idle
@@ -4297,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4297 int all_pinned = 0; 4276 int all_pinned = 0;
4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4277 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4299 4278
4300 cpumask_copy(cpus, cpu_online_mask); 4279 cpumask_copy(cpus, cpu_active_mask);
4301 4280
4302 /* 4281 /*
4303 * When power savings policy is enabled for the parent domain, idle 4282 * When power savings policy is enabled for the parent domain, idle
@@ -4694,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
4694 cpumask_set_cpu(cpu, nohz.cpu_mask); 4673 cpumask_set_cpu(cpu, nohz.cpu_mask);
4695 4674
4696 /* time for ilb owner also to sleep */ 4675 /* time for ilb owner also to sleep */
4697 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4676 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4698 if (atomic_read(&nohz.load_balancer) == cpu) 4677 if (atomic_read(&nohz.load_balancer) == cpu)
4699 atomic_set(&nohz.load_balancer, -1); 4678 atomic_set(&nohz.load_balancer, -1);
4700 return 0; 4679 return 0;
@@ -5396,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
5396#endif 5375#endif
5397} 5376}
5398 5377
5399static void put_prev_task(struct rq *rq, struct task_struct *p) 5378static void put_prev_task(struct rq *rq, struct task_struct *prev)
5400{ 5379{
5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5380 if (prev->state == TASK_RUNNING) {
5381 u64 runtime = prev->se.sum_exec_runtime;
5402 5382
5403 update_avg(&p->se.avg_running, runtime); 5383 runtime -= prev->se.prev_sum_exec_runtime;
5384 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5404 5385
5405 if (p->state == TASK_RUNNING) {
5406 /* 5386 /*
5407 * In order to avoid avg_overlap growing stale when we are 5387 * In order to avoid avg_overlap growing stale when we are
5408 * indeed overlapping and hence not getting put to sleep, grow 5388 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5412 * correlates to the amount of cache footprint a task can 5392 * correlates to the amount of cache footprint a task can
5413 * build up. 5393 * build up.
5414 */ 5394 */
5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5395 update_avg(&prev->se.avg_overlap, runtime);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5419 } 5396 }
5420 p->sched_class->put_prev_task(rq, p); 5397 prev->sched_class->put_prev_task(rq, prev);
5421} 5398}
5422 5399
5423/* 5400/*
@@ -6631,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6631long sched_getaffinity(pid_t pid, struct cpumask *mask) 6608long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632{ 6609{
6633 struct task_struct *p; 6610 struct task_struct *p;
6611 unsigned long flags;
6612 struct rq *rq;
6634 int retval; 6613 int retval;
6635 6614
6636 get_online_cpus(); 6615 get_online_cpus();
@@ -6645,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6645 if (retval) 6624 if (retval)
6646 goto out_unlock; 6625 goto out_unlock;
6647 6626
6627 rq = task_rq_lock(p, &flags);
6648 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6629 task_rq_unlock(rq, &flags);
6649 6630
6650out_unlock: 6631out_unlock:
6651 read_unlock(&tasklist_lock); 6632 read_unlock(&tasklist_lock);
@@ -6883,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6883{ 6864{
6884 struct task_struct *p; 6865 struct task_struct *p;
6885 unsigned int time_slice; 6866 unsigned int time_slice;
6867 unsigned long flags;
6868 struct rq *rq;
6886 int retval; 6869 int retval;
6887 struct timespec t; 6870 struct timespec t;
6888 6871
@@ -6899,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6899 if (retval) 6882 if (retval)
6900 goto out_unlock; 6883 goto out_unlock;
6901 6884
6902 time_slice = p->sched_class->get_rr_interval(p); 6885 rq = task_rq_lock(p, &flags);
6886 time_slice = p->sched_class->get_rr_interval(rq, p);
6887 task_rq_unlock(rq, &flags);
6903 6888
6904 read_unlock(&tasklist_lock); 6889 read_unlock(&tasklist_lock);
6905 jiffies_to_timespec(time_slice, &t); 6890 jiffies_to_timespec(time_slice, &t);
@@ -7000,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7000 __sched_fork(idle); 6985 __sched_fork(idle);
7001 idle->se.exec_start = sched_clock(); 6986 idle->se.exec_start = sched_clock();
7002 6987
7003 idle->prio = idle->normal_prio = MAX_PRIO;
7004 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6988 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
7005 __set_task_cpu(idle, cpu); 6989 __set_task_cpu(idle, cpu);
7006 6990
@@ -7041,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
7041 * 7025 *
7042 * This idea comes from the SD scheduler of Con Kolivas: 7026 * This idea comes from the SD scheduler of Con Kolivas:
7043 */ 7027 */
7044static inline void sched_init_granularity(void) 7028static int get_update_sysctl_factor(void)
7045{ 7029{
7046 unsigned int factor = 1 + ilog2(num_online_cpus()); 7030 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7047 const unsigned long limit = 200000000; 7031 unsigned int factor;
7032
7033 switch (sysctl_sched_tunable_scaling) {
7034 case SCHED_TUNABLESCALING_NONE:
7035 factor = 1;
7036 break;
7037 case SCHED_TUNABLESCALING_LINEAR:
7038 factor = cpus;
7039 break;
7040 case SCHED_TUNABLESCALING_LOG:
7041 default:
7042 factor = 1 + ilog2(cpus);
7043 break;
7044 }
7048 7045
7049 sysctl_sched_min_granularity *= factor; 7046 return factor;
7050 if (sysctl_sched_min_granularity > limit) 7047}
7051 sysctl_sched_min_granularity = limit;
7052 7048
7053 sysctl_sched_latency *= factor; 7049static void update_sysctl(void)
7054 if (sysctl_sched_latency > limit) 7050{
7055 sysctl_sched_latency = limit; 7051 unsigned int factor = get_update_sysctl_factor();
7056 7052
7057 sysctl_sched_wakeup_granularity *= factor; 7053#define SET_SYSCTL(name) \
7054 (sysctl_##name = (factor) * normalized_sysctl_##name)
7055 SET_SYSCTL(sched_min_granularity);
7056 SET_SYSCTL(sched_latency);
7057 SET_SYSCTL(sched_wakeup_granularity);
7058 SET_SYSCTL(sched_shares_ratelimit);
7059#undef SET_SYSCTL
7060}
7058 7061
7059 sysctl_sched_shares_ratelimit *= factor; 7062static inline void sched_init_granularity(void)
7063{
7064 update_sysctl();
7060} 7065}
7061 7066
7062#ifdef CONFIG_SMP 7067#ifdef CONFIG_SMP
@@ -7093,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7093 int ret = 0; 7098 int ret = 0;
7094 7099
7095 rq = task_rq_lock(p, &flags); 7100 rq = task_rq_lock(p, &flags);
7096 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7101 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7097 ret = -EINVAL; 7102 ret = -EINVAL;
7098 goto out; 7103 goto out;
7099 } 7104 }
@@ -7115,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7115 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7120 if (cpumask_test_cpu(task_cpu(p), new_mask))
7116 goto out; 7121 goto out;
7117 7122
7118 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7123 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7119 /* Need help from migration thread: drop lock and wait. */ 7124 /* Need help from migration thread: drop lock and wait. */
7120 struct task_struct *mt = rq->migration_thread; 7125 struct task_struct *mt = rq->migration_thread;
7121 7126
@@ -7269,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7269 7274
7270again: 7275again:
7271 /* Look for allowed, online CPU in same node. */ 7276 /* Look for allowed, online CPU in same node. */
7272 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7277 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7273 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7278 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7274 goto move; 7279 goto move;
7275 7280
7276 /* Any allowed, online CPU? */ 7281 /* Any allowed, online CPU? */
7277 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7282 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7278 if (dest_cpu < nr_cpu_ids) 7283 if (dest_cpu < nr_cpu_ids)
7279 goto move; 7284 goto move;
7280 7285
7281 /* No more Mr. Nice Guy. */ 7286 /* No more Mr. Nice Guy. */
7282 if (dest_cpu >= nr_cpu_ids) { 7287 if (dest_cpu >= nr_cpu_ids) {
7283 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7288 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7284 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7289 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7285 7290
7286 /* 7291 /*
7287 * Don't tell them about moving exiting tasks or 7292 * Don't tell them about moving exiting tasks or
@@ -7310,7 +7315,7 @@ move:
7310 */ 7315 */
7311static void migrate_nr_uninterruptible(struct rq *rq_src) 7316static void migrate_nr_uninterruptible(struct rq *rq_src)
7312{ 7317{
7313 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7318 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7314 unsigned long flags; 7319 unsigned long flags;
7315 7320
7316 local_irq_save(flags); 7321 local_irq_save(flags);
@@ -7563,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7563static struct ctl_table_header *sd_sysctl_header; 7568static struct ctl_table_header *sd_sysctl_header;
7564static void register_sched_domain_sysctl(void) 7569static void register_sched_domain_sysctl(void)
7565{ 7570{
7566 int i, cpu_num = num_online_cpus(); 7571 int i, cpu_num = num_possible_cpus();
7567 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7572 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7568 char buf[32]; 7573 char buf[32];
7569 7574
@@ -7573,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
7573 if (entry == NULL) 7578 if (entry == NULL)
7574 return; 7579 return;
7575 7580
7576 for_each_online_cpu(i) { 7581 for_each_possible_cpu(i) {
7577 snprintf(buf, 32, "cpu%d", i); 7582 snprintf(buf, 32, "cpu%d", i);
7578 entry->procname = kstrdup(buf, GFP_KERNEL); 7583 entry->procname = kstrdup(buf, GFP_KERNEL);
7579 entry->mode = 0555; 7584 entry->mode = 0555;
@@ -7703,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7703 spin_lock_irq(&rq->lock); 7708 spin_lock_irq(&rq->lock);
7704 update_rq_clock(rq); 7709 update_rq_clock(rq);
7705 deactivate_task(rq, rq->idle, 0); 7710 deactivate_task(rq, rq->idle, 0);
7706 rq->idle->static_prio = MAX_PRIO;
7707 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7711 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7708 rq->idle->sched_class = &idle_sched_class; 7712 rq->idle->sched_class = &idle_sched_class;
7709 migrate_dead_tasks(cpu); 7713 migrate_dead_tasks(cpu);
@@ -9099,7 +9103,7 @@ match1:
9099 if (doms_new == NULL) { 9103 if (doms_new == NULL) {
9100 ndoms_cur = 0; 9104 ndoms_cur = 0;
9101 doms_new = &fallback_doms; 9105 doms_new = &fallback_doms;
9102 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); 9106 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9103 WARN_ON_ONCE(dattr_new); 9107 WARN_ON_ONCE(dattr_new);
9104 } 9108 }
9105 9109
@@ -9230,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9230 switch (action) { 9234 switch (action) {
9231 case CPU_ONLINE: 9235 case CPU_ONLINE:
9232 case CPU_ONLINE_FROZEN: 9236 case CPU_ONLINE_FROZEN:
9233 case CPU_DEAD: 9237 case CPU_DOWN_PREPARE:
9234 case CPU_DEAD_FROZEN: 9238 case CPU_DOWN_PREPARE_FROZEN:
9239 case CPU_DOWN_FAILED:
9240 case CPU_DOWN_FAILED_FROZEN:
9235 partition_sched_domains(1, NULL, NULL); 9241 partition_sched_domains(1, NULL, NULL);
9236 return NOTIFY_OK; 9242 return NOTIFY_OK;
9237 9243
@@ -9278,7 +9284,7 @@ void __init sched_init_smp(void)
9278#endif 9284#endif
9279 get_online_cpus(); 9285 get_online_cpus();
9280 mutex_lock(&sched_domains_mutex); 9286 mutex_lock(&sched_domains_mutex);
9281 arch_init_sched_domains(cpu_online_mask); 9287 arch_init_sched_domains(cpu_active_mask);
9282 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9288 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9283 if (cpumask_empty(non_isolated_cpus)) 9289 if (cpumask_empty(non_isolated_cpus))
9284 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9290 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9842,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9842 se = kzalloc_node(sizeof(struct sched_entity), 9848 se = kzalloc_node(sizeof(struct sched_entity),
9843 GFP_KERNEL, cpu_to_node(i)); 9849 GFP_KERNEL, cpu_to_node(i));
9844 if (!se) 9850 if (!se)
9845 goto err; 9851 goto err_free_rq;
9846 9852
9847 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9853 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9848 } 9854 }
9849 9855
9850 return 1; 9856 return 1;
9851 9857
9858 err_free_rq:
9859 kfree(cfs_rq);
9852 err: 9860 err:
9853 return 0; 9861 return 0;
9854} 9862}
@@ -9930,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9930 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9938 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9931 GFP_KERNEL, cpu_to_node(i)); 9939 GFP_KERNEL, cpu_to_node(i));
9932 if (!rt_se) 9940 if (!rt_se)
9933 goto err; 9941 goto err_free_rq;
9934 9942
9935 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9943 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9936 } 9944 }
9937 9945
9938 return 1; 9946 return 1;
9939 9947
9948 err_free_rq:
9949 kfree(rt_rq);
9940 err: 9950 err:
9941 return 0; 9951 return 0;
9942} 9952}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f705..5ae24fc65d75 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
309 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
310} 310}
311 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
312static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
313{ 319{
314 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
334#undef PN 340#undef PN
335#undef P 341#undef P
336 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
337 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
338 print_cpu(m, cpu); 348 print_cpu(m, cpu);
339 349
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
399 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
400 PN(se.avg_overlap); 410 PN(se.avg_overlap);
401 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
402 PN(se.avg_running);
403 412
404 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
405 414
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
423 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
424 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
425 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
426 P(se.nr_forced2_migrations);
427 P(se.nr_wakeups); 435 P(se.nr_wakeups);
428 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
429 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
499 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
500 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
501 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
502 p->se.nr_forced2_migrations = 0;
503 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
504 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
505 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336d..804a411838f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
48 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -1403,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 new_cpu = prev_cpu; 1428 new_cpu = prev_cpu;
1404 } 1429 }
1405 1430
1406 rcu_read_lock();
1407 for_each_domain(cpu, tmp) { 1431 for_each_domain(cpu, tmp) {
1408 /* 1432 /*
1409 * If power savings logic is enabled for a domain, see if we 1433 * If power savings logic is enabled for a domain, see if we
@@ -1484,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1484 update_shares(tmp); 1508 update_shares(tmp);
1485 } 1509 }
1486 1510
1487 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1511 if (affine_sd && wake_affine(affine_sd, p, sync))
1488 new_cpu = cpu; 1512 return cpu;
1489 goto out;
1490 }
1491 1513
1492 while (sd) { 1514 while (sd) {
1493 int load_idx = sd->forkexec_idx; 1515 int load_idx = sd->forkexec_idx;
@@ -1528,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1528 /* while loop will break here if sd == NULL */ 1550 /* while loop will break here if sd == NULL */
1529 } 1551 }
1530 1552
1531out:
1532 rcu_read_unlock();
1533 return new_cpu; 1553 return new_cpu;
1534} 1554}
1535#endif /* CONFIG_SMP */ 1555#endif /* CONFIG_SMP */
@@ -1651,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1651 int sync = wake_flags & WF_SYNC; 1671 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency; 1672 int scale = cfs_rq->nr_running >= sched_nr_latency;
1653 1673
1654 update_curr(cfs_rq); 1674 if (unlikely(rt_prio(p->prio)))
1655 1675 goto preempt;
1656 if (unlikely(rt_prio(p->prio))) {
1657 resched_task(curr);
1658 return;
1659 }
1660 1676
1661 if (unlikely(p->sched_class != &fair_sched_class)) 1677 if (unlikely(p->sched_class != &fair_sched_class))
1662 return; 1678 return;
@@ -1682,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1682 return; 1698 return;
1683 1699
1684 /* Idle tasks are by definition preempted by everybody. */ 1700 /* Idle tasks are by definition preempted by everybody. */
1685 if (unlikely(curr->policy == SCHED_IDLE)) { 1701 if (unlikely(curr->policy == SCHED_IDLE))
1686 resched_task(curr); 1702 goto preempt;
1687 return;
1688 }
1689 1703
1690 if ((sched_feat(WAKEUP_SYNC) && sync) || 1704 if (sched_feat(WAKEUP_SYNC) && sync)
1691 (sched_feat(WAKEUP_OVERLAP) && 1705 goto preempt;
1692 (se->avg_overlap < sysctl_sched_migration_cost &&
1693 pse->avg_overlap < sysctl_sched_migration_cost))) {
1694 resched_task(curr);
1695 return;
1696 }
1697 1706
1698 if (sched_feat(WAKEUP_RUNNING)) { 1707 if (sched_feat(WAKEUP_OVERLAP) &&
1699 if (pse->avg_running < se->avg_running) { 1708 se->avg_overlap < sysctl_sched_migration_cost &&
1700 set_next_buddy(pse); 1709 pse->avg_overlap < sysctl_sched_migration_cost)
1701 resched_task(curr); 1710 goto preempt;
1702 return;
1703 }
1704 }
1705 1711
1706 if (!sched_feat(WAKEUP_PREEMPT)) 1712 if (!sched_feat(WAKEUP_PREEMPT))
1707 return; 1713 return;
1708 1714
1715 update_curr(cfs_rq);
1709 find_matching_se(&se, &pse); 1716 find_matching_se(&se, &pse);
1710
1711 BUG_ON(!pse); 1717 BUG_ON(!pse);
1718 if (wakeup_preempt_entity(se, pse) == 1)
1719 goto preempt;
1712 1720
1713 if (wakeup_preempt_entity(se, pse) == 1) { 1721 return;
1714 resched_task(curr); 1722
1715 /* 1723preempt:
1716 * Only set the backward buddy when the current task is still 1724 resched_task(curr);
1717 * on the rq. This can happen when a wakeup gets interleaved 1725 /*
1718 * with schedule on the ->pre_schedule() or idle_balance() 1726 * Only set the backward buddy when the current task is still
1719 * point, either of which can drop the rq lock. 1727 * on the rq. This can happen when a wakeup gets interleaved
1720 * 1728 * with schedule on the ->pre_schedule() or idle_balance()
1721 * Also, during early boot the idle thread is in the fair class, 1729 * point, either of which can drop the rq lock.
1722 * for obvious reasons its a bad idea to schedule back to it. 1730 *
1723 */ 1731 * Also, during early boot the idle thread is in the fair class,
1724 if (unlikely(!se->on_rq || curr == rq->idle)) 1732 * for obvious reasons its a bad idea to schedule back to it.
1725 return; 1733 */
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1734 if (unlikely(!se->on_rq || curr == rq->idle))
1727 set_last_buddy(se); 1735 return;
1728 } 1736
1737 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1738 set_last_buddy(se);
1729} 1739}
1730 1740
1731static struct task_struct *pick_next_task_fair(struct rq *rq) 1741static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1905 1915
1906 return 0; 1916 return 0;
1907} 1917}
1918
1919static void rq_online_fair(struct rq *rq)
1920{
1921 update_sysctl();
1922}
1923
1924static void rq_offline_fair(struct rq *rq)
1925{
1926 update_sysctl();
1927}
1928
1908#endif /* CONFIG_SMP */ 1929#endif /* CONFIG_SMP */
1909 1930
1910/* 1931/*
@@ -1922,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1922} 1943}
1923 1944
1924/* 1945/*
1925 * Share the fairness runtime between parent and child, thus the 1946 * called on fork with the child task as argument from the parent's context
1926 * total amount of pressure for CPU stays equal - new tasks 1947 * - child not yet on the tasklist
1927 * get a chance to run but frequent forkers are not allowed to 1948 * - preemption disabled
1928 * monopolize the CPU. Note: the parent runqueue is locked,
1929 * the child is not running yet.
1930 */ 1949 */
1931static void task_new_fair(struct rq *rq, struct task_struct *p) 1950static void task_fork_fair(struct task_struct *p)
1932{ 1951{
1933 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1952 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1934 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1953 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1935 int this_cpu = smp_processor_id(); 1954 int this_cpu = smp_processor_id();
1955 struct rq *rq = this_rq();
1956 unsigned long flags;
1957
1958 spin_lock_irqsave(&rq->lock, flags);
1936 1959
1937 sched_info_queued(p); 1960 if (unlikely(task_cpu(p) != this_cpu))
1961 __set_task_cpu(p, this_cpu);
1938 1962
1939 update_curr(cfs_rq); 1963 update_curr(cfs_rq);
1964
1940 if (curr) 1965 if (curr)
1941 se->vruntime = curr->vruntime; 1966 se->vruntime = curr->vruntime;
1942 place_entity(cfs_rq, se, 1); 1967 place_entity(cfs_rq, se, 1);
1943 1968
1944 /* 'curr' will be NULL if the child belongs to a different group */ 1969 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1945 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1946 curr && entity_before(curr, se)) {
1947 /* 1970 /*
1948 * Upon rescheduling, sched_class::put_prev_task() will place 1971 * Upon rescheduling, sched_class::put_prev_task() will place
1949 * 'current' within the tree based on its new key value. 1972 * 'current' within the tree based on its new key value.
@@ -1952,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1952 resched_task(rq->curr); 1975 resched_task(rq->curr);
1953 } 1976 }
1954 1977
1955 enqueue_task_fair(rq, p, 0); 1978 spin_unlock_irqrestore(&rq->lock, flags);
1956} 1979}
1957 1980
1958/* 1981/*
@@ -2014,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
2014} 2037}
2015#endif 2038#endif
2016 2039
2017unsigned int get_rr_interval_fair(struct task_struct *task) 2040unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2018{ 2041{
2019 struct sched_entity *se = &task->se; 2042 struct sched_entity *se = &task->se;
2020 unsigned long flags;
2021 struct rq *rq;
2022 unsigned int rr_interval = 0; 2043 unsigned int rr_interval = 0;
2023 2044
2024 /* 2045 /*
2025 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2046 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
2026 * idle runqueue: 2047 * idle runqueue:
2027 */ 2048 */
2028 rq = task_rq_lock(task, &flags);
2029 if (rq->cfs.load.weight) 2049 if (rq->cfs.load.weight)
2030 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2050 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
2031 task_rq_unlock(rq, &flags);
2032 2051
2033 return rr_interval; 2052 return rr_interval;
2034} 2053}
@@ -2052,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
2052 2071
2053 .load_balance = load_balance_fair, 2072 .load_balance = load_balance_fair,
2054 .move_one_task = move_one_task_fair, 2073 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair,
2055#endif 2076#endif
2056 2077
2057 .set_curr_task = set_curr_task_fair, 2078 .set_curr_task = set_curr_task_fair,
2058 .task_tick = task_tick_fair, 2079 .task_tick = task_tick_fair,
2059 .task_new = task_new_fair, 2080 .task_fork = task_fork_fair,
2060 2081
2061 .prio_changed = prio_changed_fair, 2082 .prio_changed = prio_changed_fair,
2062 .switched_to = switched_to_fair, 2083 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..33d5384a73a8 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef378415..aecbd9c6b20c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1722} 1722}
1723 1723
1724unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1725{
1726 /* 1726 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b9..585d6cd10040 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -349,6 +348,9 @@ void kernel_power_off(void)
349 machine_power_off(); 348 machine_power_off();
350} 349}
351EXPORT_SYMBOL_GPL(kernel_power_off); 350EXPORT_SYMBOL_GPL(kernel_power_off);
351
352static DEFINE_MUTEX(reboot_mutex);
353
352/* 354/*
353 * Reboot system call: for obvious reasons only root may call it, 355 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 356 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 383 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 384 cmd = LINUX_REBOOT_CMD_HALT;
383 385
384 lock_kernel(); 386 mutex_lock(&reboot_mutex);
385 switch (cmd) { 387 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 388 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 389 kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 399
398 case LINUX_REBOOT_CMD_HALT: 400 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 401 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 402 do_exit(0);
402 panic("cannot halt"); 403 panic("cannot halt");
403 404
404 case LINUX_REBOOT_CMD_POWER_OFF: 405 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 406 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 407 do_exit(0);
408 break; 408 break;
409 409
410 case LINUX_REBOOT_CMD_RESTART2: 410 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 412 ret = -EFAULT;
413 return -EFAULT; 413 break;
414 } 414 }
415 buffer[sizeof(buffer) - 1] = '\0'; 415 buffer[sizeof(buffer) - 1] = '\0';
416 416
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 433 ret = -EINVAL;
434 break; 434 break;
435 } 435 }
436 unlock_kernel(); 436 mutex_unlock(&reboot_mutex);
437 return ret; 437 return ret;
438} 438}
439 439
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c5..554ac4894f0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
245static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
247#endif 251#endif
248 252
249static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
260 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
261 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
262 .mode = 0644, 266 .mode = 0644,
263 .proc_handler = sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
264 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
265 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
266 }, 270 },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
269 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
270 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
271 .mode = 0644, 275 .mode = 0644,
272 .proc_handler = sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
273 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
274 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
275 }, 279 },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
278 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
279 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
280 .mode = 0644, 284 .mode = 0644,
281 .proc_handler = proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
282 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
283 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
284 }, 288 },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
287 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
288 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
289 .mode = 0644, 293 .mode = 0644,
290 .proc_handler = proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
297 },
298 {
299 .procname = "sched_tunable_scaling",
300 .data = &sysctl_sched_tunable_scaling,
301 .maxlen = sizeof(enum sched_tunable_scaling),
302 .mode = 0644,
303 .proc_handler = sched_proc_update_handler,
304 .extra1 = &min_sched_tunable_scaling,
305 .extra2 = &max_sched_tunable_scaling,
291 }, 306 },
292 { 307 {
293 .procname = "sched_shares_thresh", 308 .procname = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
298 .extra1 = &zero, 313 .extra1 = &zero,
299 }, 314 },
300 { 315 {
301 .procname = "sched_features",
302 .data = &sysctl_sched_features,
303 .maxlen = sizeof(unsigned int),
304 .mode = 0644,
305 .proc_handler = proc_dointvec,
306 },
307 {
308 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
309 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
310 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d422c7b2236b..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -677,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
677 * @count: length of buffer 677 * @count: length of buffer
678 * 678 *
679 * Takes input from sysfs interface for manually overriding the default 679 * Takes input from sysfs interface for manually overriding the default
680 * clocksource selction. 680 * clocksource selection.
681 */ 681 */
682static ssize_t sysfs_override_clocksource(struct sys_device *dev, 682static ssize_t sysfs_override_clocksource(struct sys_device *dev,
683 struct sysdev_attribute *attr, 683 struct sysdev_attribute *attr,
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 665c76edbf17..9d80db4747d4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
254 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
255 int cpu; 258 int cpu;
256 259
257 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
258 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
259 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
260 263
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ed223447a3f..7ecab06547a5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv)
606 */ 606 */
607 struct trace_probe *tp; 607 struct trace_probe *tp;
608 int i, ret = 0; 608 int i, ret = 0;
609 int is_return = 0; 609 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0; 611 unsigned long offset = 0;
612 void *addr = NULL; 612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN]; 613 char buf[MAX_EVENT_NAME_LEN];
614 614
615 if (argc < 2) { 615 /* argc must be >= 1 */
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p') 616 if (argv[0][0] == 'p')
621 is_return = 0; 617 is_return = 0;
622 else if (argv[0][0] == 'r') 618 else if (argv[0][0] == 'r')
623 is_return = 1; 619 is_return = 1;
620 else if (argv[0][0] == '-')
621 is_delete = 1;
624 else { 622 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n"); 623 pr_info("Probe definition must be started with 'p', 'r' or"
624 " '-'.\n");
626 return -EINVAL; 625 return -EINVAL;
627 } 626 }
628 627
@@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv)
642 return -EINVAL; 641 return -EINVAL;
643 } 642 }
644 } 643 }
644 if (!group)
645 group = KPROBE_EVENT_SYSTEM;
645 646
647 if (is_delete) {
648 if (!event) {
649 pr_info("Delete command needs an event name.\n");
650 return -EINVAL;
651 }
652 tp = find_probe_event(event, group);
653 if (!tp) {
654 pr_info("Event %s/%s doesn't exist.\n", group, event);
655 return -ENOENT;
656 }
657 /* delete an event */
658 unregister_trace_probe(tp);
659 free_trace_probe(tp);
660 return 0;
661 }
662
663 if (argc < 2) {
664 pr_info("Probe point is not specified.\n");
665 return -EINVAL;
666 }
646 if (isdigit(argv[1][0])) { 667 if (isdigit(argv[1][0])) {
647 if (is_return) { 668 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n"); 669 pr_info("Return probe point must be a symbol.\n");
@@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv)
671 argc -= 2; argv += 2; 692 argc -= 2; argv += 2;
672 693
673 /* setup a probe */ 694 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) { 695 if (!event) {
677 /* Make a new event name */ 696 /* Make a new event name */
678 if (symbol) 697 if (symbol)
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 48f1c6c248c6..faf37fa4408c 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
79} 79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */ 80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81 81
82void ksym_hbp_handler(struct perf_event *hbp, void *data) 82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
83{ 85{
84 struct ring_buffer_event *event; 86 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry; 87 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer; 88 struct ring_buffer *buffer;
88 int pc; 89 int pc;
89 90
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)