-rw-r--r--  arch/powerpc/kernel/process.c   |  2
-rw-r--r--  arch/powerpc/kernel/time.c      | 25
-rw-r--r--  arch/s390/kernel/time.c         |  4
-rw-r--r--  arch/s390/kernel/vtime.c        |  8
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c |  6
-rw-r--r--  arch/x86/kernel/nmi_32.c        |  4
-rw-r--r--  include/linux/sched.h           | 17
-rw-r--r--  include/linux/smp.h             |  7
-rw-r--r--  init/main.c                     |  4
-rw-r--r--  kernel/fork.c                   |  6
-rw-r--r--  kernel/sched.c                  | 59
-rw-r--r--  kernel/sched_debug.c            |  2
-rw-r--r--  kernel/sched_fair.c             | 96
-rw-r--r--  kernel/sched_stats.h            | 11
-rw-r--r--  kernel/sysctl.c                 | 23
-rw-r--r--  kernel/timer.c                  | 21
16 files changed, 172 insertions(+), 123 deletions(-)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index b9d88374f14f..41e13f4cc6e3 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -350,7 +350,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	local_irq_save(flags);
 
 	account_system_vtime(current);
-	account_process_vtime(current);
+	account_process_tick(current, 0);
 	calculate_steal_time();
 
 	last = _switch(old_thread, new_thread);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 99ebcd3884d2..4beb6329dfb7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -259,7 +259,7 @@ void account_system_vtime(struct task_struct *tsk)
  * user and system time records.
  * Must be called with interrupts disabled.
  */
-void account_process_vtime(struct task_struct *tsk)
+void account_process_tick(struct task_struct *tsk, int user_tick)
 {
 	cputime_t utime, utimescaled;
 
@@ -274,18 +274,6 @@ void account_process_vtime(struct task_struct *tsk)
 	account_user_time_scaled(tsk, utimescaled);
 }
 
-static void account_process_time(struct pt_regs *regs)
-{
-	int cpu = smp_processor_id();
-
-	account_process_vtime(current);
-	run_local_timers();
-	if (rcu_pending(cpu))
-		rcu_check_callbacks(cpu, user_mode(regs));
-	scheduler_tick();
-	run_posix_cpu_timers(current);
-}
-
 /*
  * Stuff for accounting stolen time.
  */
@@ -375,7 +363,6 @@ static void snapshot_purr(void)
 
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
 #define calc_cputime_factors()
-#define account_process_time(regs)	update_process_times(user_mode(regs))
 #define calculate_steal_time()		do { } while (0)
 #endif
 
@@ -599,16 +586,6 @@ void timer_interrupt(struct pt_regs * regs)
 	get_lppaca()->int_dword.fields.decr_int = 0;
 #endif
 
-	/*
-	 * We cannot disable the decrementer, so in the period
-	 * between this cpu's being marked offline in cpu_online_map
-	 * and calling stop-self, it is taking timer interrupts.
-	 * Avoid calling into the scheduler rebalancing code if this
-	 * is the case.
-	 */
-	if (!cpu_is_offline(cpu))
-		account_process_time(regs);
-
 	if (evt->event_handler)
 		evt->event_handler(evt);
 
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index a963fe81359e..22b800ce2126 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -145,12 +145,8 @@ void account_ticks(u64 time)
 	do_timer(ticks);
 #endif
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	account_tick_vtime(current);
-#else
 	while (ticks--)
 		update_process_times(user_mode(get_irq_regs()));
-#endif
 
 	s390_do_profile();
 }
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 84ff78de6bac..c5f05b3fb2c3 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer);
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
-void account_tick_vtime(struct task_struct *tsk)
+void account_process_tick(struct task_struct *tsk, int user_tick)
 {
 	cputime_t cputime;
 	__u64 timer, clock;
@@ -64,12 +64,6 @@ void account_tick_vtime(struct task_struct *tsk)
 		S390_lowcore.steal_clock -= cputime << 12;
 		account_steal_time(tsk, cputime);
 	}
-
-	run_local_timers();
-	if (rcu_pending(smp_processor_id()))
-		rcu_check_callbacks(smp_processor_id(), rcu_user_flag);
-	scheduler_tick();
-	run_posix_cpu_timers(tsk);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 9abbdf7562c5..3b20613325dc 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -139,13 +139,12 @@ struct set_mtrr_data {
 	mtrr_type	smp_type;
 };
 
-#ifdef CONFIG_SMP
-
 static void ipi_handler(void *info)
 /*  [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
     [RETURNS] Nothing.
 */
 {
+#ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
 	unsigned long flags;
 
@@ -168,9 +167,8 @@ static void ipi_handler(void *info)
 
 	atomic_dec(&data->count);
 	local_irq_restore(flags);
-}
-
 #endif
+}
 
 static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
 	return type1 == MTRR_TYPE_UNCACHABLE ||
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index f803ed0ed1c4..600fd404e440 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
 static int endflag __initdata = 0;
 
-#ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
  * CPUs during the test make them busy.
  */
 static __init void nmi_cpu_busy(void *data)
 {
+#ifdef CONFIG_SMP
 	local_irq_enable_in_hardirq();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
@@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
 	   care if they get somewhat less cycles. */
 	while (endflag == 0)
 		mb();
-}
 #endif
+}
 
 static int __init check_nmi_watchdog(void)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 155d7438f7ad..ee800e7a70de 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,6 +254,7 @@ long io_schedule_timeout(long timeout);
 
 extern void cpu_init (void);
 extern void trap_init(void);
+extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
@@ -862,7 +863,6 @@ struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
 	unsigned int		on_rq;
-	int			peer_preempt;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
@@ -1460,12 +1460,17 @@ extern void sched_idle_next(void);
 
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_nr_latency;
+extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos);
 #endif
 
 extern unsigned int sysctl_sched_compat_yield;
@@ -1983,6 +1988,14 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
+#ifdef CONFIG_SMP
+void migration_init(void);
+#else
+static inline void migration_init(void)
+{
+}
+#endif
+
 #endif	/* __KERNEL__ */
 
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 259a13c3bd98..c25e66bcecf3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -84,11 +84,12 @@ void smp_prepare_boot_cpu(void);
  * These macros fold the SMP functionality into a single CPU system
  */
 #define raw_smp_processor_id()			0
-static inline int up_smp_call_function(void)
+static inline int up_smp_call_function(void (*func)(void *), void *info)
 {
 	return 0;
 }
-#define smp_call_function(func,info,retry,wait)	(up_smp_call_function())
+#define smp_call_function(func, info, retry, wait) \
+			(up_smp_call_function(func, info))
 #define on_each_cpu(func,info,retry,wait)	\
 	({					\
 		local_irq_disable();		\
@@ -107,6 +108,8 @@ static inline void smp_send_reschedule(int cpu) { }
 		local_irq_enable();		\
 		0;				\
 	})
+#define smp_call_function_mask(mask, func, info, wait) \
+			(up_smp_call_function(func, info))
 
 #endif /* !SMP */
 
diff --git a/init/main.c b/init/main.c
index f605a969ea61..80b04b6c5157 100644
--- a/init/main.c
+++ b/init/main.c
@@ -56,6 +56,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/sched.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -747,11 +748,8 @@ __setup("nosoftlockup", nosoftlockup_setup);
 static void __init do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
-#ifdef CONFIG_SMP
-	extern int migration_init(void);
 
 	migration_init();
-#endif
 	spawn_ksoftirqd();
 	if (!nosoftlockup)
 		spawn_softlockup_task();
diff --git a/kernel/fork.c b/kernel/fork.c
index 28a740151988..8ca1a14cdc8c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
@@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
 	 * on the tasklist. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f6bd1112900..b18f231a4875 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,7 +75,7 @@
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
-	return (unsigned long long)jiffies * (1000000000 / HZ);
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
 
 /*
@@ -99,8 +99,8 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
@@ -460,7 +460,6 @@ enum {
 	SCHED_FEAT_TREE_AVG		= 4,
 	SCHED_FEAT_APPROX_AVG		= 8,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
-	SCHED_FEAT_PREEMPT_RESTRICT	= 32,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -468,12 +467,17 @@ const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
 		SCHED_FEAT_APPROX_AVG		* 0 |
-		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
-		SCHED_FEAT_PREEMPT_RESTRICT	* 1;
+		SCHED_FEAT_WAKEUP_PREEMPT	* 1;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
 /*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
@@ -2237,7 +2241,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2251,10 +2255,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	 */
 	p = iterator->start(iterator->arg);
 next:
-	if (!p)
+	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
 	/*
-	 * To help distribute high priority tasks accross CPUs we don't
+	 * To help distribute high priority tasks across CPUs we don't
 	 * skip a task if it will be the highest priority task (i.e. smallest
 	 * prio value) on its new queue regardless of its load weight
 	 */
@@ -2271,8 +2275,7 @@ next:
 	rem_load_move -= p->se.load.weight;
 
 	/*
-	 * We only want to steal up to the prescribed number of tasks
-	 * and the prescribed amount of weighted load.
+	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
 	if (rem_load_move > 0) {
 		if (p->prio < *this_best_prio)
@@ -4992,6 +4995,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  */
 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+	unsigned int factor = 1 + ilog2(num_online_cpus());
+	const unsigned long limit = 200000000;
+
+	sysctl_sched_min_granularity *= factor;
+	if (sysctl_sched_min_granularity > limit)
+		sysctl_sched_min_granularity = limit;
+
+	sysctl_sched_latency *= factor;
+	if (sysctl_sched_latency > limit)
+		sysctl_sched_latency = limit;
+
+	sysctl_sched_wakeup_granularity *= factor;
+	sysctl_sched_batch_wakeup_granularity *= factor;
+}
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -5621,7 +5650,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-int __init migration_init(void)
+void __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -5631,8 +5660,6 @@ int __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
-
-	return 0;
 }
 #endif
 
@@ -6688,10 +6715,12 @@ void __init sched_init_smp(void)
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
+	sched_init_granularity();
 }
 #else
 void __init sched_init_smp(void)
 {
+	sched_init_granularity();
 }
 #endif /* CONFIG_SMP */
 
@@ -7228,7 +7257,7 @@ static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
 		spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
 	}
 	/* Convert from ns to ms */
-	do_div(res, 1000000);
+	do_div(res, NSEC_PER_MSEC);
 
 	return res;
 }
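
The new sched_init_granularity() above multiplies the CFS tunables by 1 + ilog2(num_online_cpus()) and caps the latency-type values at 200 ms. Below is a minimal user-space sketch of that arithmetic only; it is not part of the patch, and ilog2_u() is a hypothetical stand-in for the kernel's ilog2().

#include <stdio.h>

/* Poor man's ilog2() for the sketch: floor(log2(n)) for n >= 1. */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

int main(void)
{
	const unsigned long limit = 200000000UL;	/* 200 ms cap, as in the patch */
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 2) {
		unsigned int factor = 1 + ilog2_u(cpus);
		unsigned long latency = 20000000UL * factor;	/* 20 ms default */

		if (latency > limit)
			latency = limit;
		printf("%2u CPUs: factor %u, sched_latency %lu ns\n",
		       cpus, factor, latency);
	}
	return 0;
}

With the 20 ms default this yields 20 ms on 1 CPU, 40 ms on 2, 60 ms on 4, and so on, until the 200 ms cap is hit.
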
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 415e5c385542..ca198a797bfa 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -211,7 +211,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #define PN(x) \
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
-	PN(sysctl_sched_nr_latency);
+	PN(sysctl_sched_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	PN(sysctl_sched_batch_wakeup_granularity);
 	PN(sysctl_sched_child_runs_first);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f662ab7..d3c03070872d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,7 +22,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms, units: nanoseconds)
+ * (default: 20ms * ilog(ncpus), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -32,19 +32,24 @@
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
  */
-const_debug unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 20000000ULL;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
- * parent will (try to) run first.
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 1 msec * ilog(ncpus), units: nanoseconds)
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec, units: nanoseconds)
+ * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+unsigned int sched_nr_latency = 20;
+
+/*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_nr_latency = 20;
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
 
 /*
  * sys_sched_yield() compat mode
@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+					sysctl_sched_min_granularity);
+
+	return 0;
+}
+#endif
 
 /*
  * The idea is to set a period in which each task runs once.
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 static u64 __sched_period(unsigned long nr_running)
 {
 	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sysctl_sched_nr_latency;
+	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
 		period *= nr_running;
@@ -259,6 +280,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 {
 	u64 vslice = __sched_period(nr_running);
 
+	vslice *= NICE_0_LOAD;
 	do_div(vslice, rq_weight);
 
 	return vslice;
@@ -472,19 +494,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 		vruntime += sched_vslice(cfs_rq)/2;
 
+	/*
+	 * The 'current' period is already promised to the current tasks,
+	 * however the extra weight of the new task will slow them down a
+	 * little, place the new task so that it fits in the slot that
+	 * stays open at the end.
+	 */
 	if (initial && sched_feat(START_DEBIT))
 		vruntime += sched_vslice_add(cfs_rq, se);
 
 	if (!initial) {
+		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
 				task_of(se)->policy != SCHED_BATCH)
 			vruntime -= sysctl_sched_latency;
 
-		vruntime = max_t(s64, vruntime, se->vruntime);
+		/* ensure we never gain time by being placed backwards. */
+		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
 	se->vruntime = vruntime;
-
 }
 
 static void
@@ -517,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		se->peer_preempt = 0;
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -545,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime ||
-			(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
+	if (delta_exec > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
-	curr->peer_preempt = 0;
 }
 
 static void
@@ -811,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
+	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -826,24 +852,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->policy == SCHED_BATCH))
 		return;
 
-	if (sched_feat(WAKEUP_PREEMPT)) {
-		while (!is_same_group(se, pse)) {
-			se = parent_entity(se);
-			pse = parent_entity(pse);
-		}
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-		if (delta > gran) {
-			int now = !sched_feat(PREEMPT_RESTRICT);
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
-			if (now || p->prio < curr->prio || !se->peer_preempt++)
-				resched_task(curr);
-		}
-	}
+	if (pse->vruntime + gran < se->vruntime)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1045,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	update_curr(cfs_rq);
 	place_entity(cfs_rq, se, 1);
 
+	/* 'curr' will be NULL if the child belongs to a different group */
 	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr->vruntime < se->vruntime) {
+			curr && curr->vruntime < se->vruntime) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1054,7 +1077,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	se->peer_preempt = 0;
 	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
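
The new sched_nr_latency_handler() above keeps the derived sched_nr_latency equal to DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity) whenever either sysctl is written. A minimal user-space sketch of that recomputation with the patch's defaults (20 ms latency, 1 ms minimum granularity); the DIV_ROUND_UP definition mirrors the kernel macro, everything else is illustrative only:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int latency  = 20000000;	/* sysctl_sched_latency default, ns */
	unsigned int min_gran =  1000000;	/* sysctl_sched_min_granularity default, ns */

	/* 20000000 / 1000000 rounds up to 20, matching sched_nr_latency = 20. */
	printf("sched_nr_latency = %u\n", DIV_ROUND_UP(latency, min_gran));
	return 0;
}

So with the defaults, the period computed in __sched_period() only starts stretching once more than 20 tasks are runnable.
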
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ef1a7df80ea2..630178e53bb6 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -127,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -155,7 +155,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-	unsigned long long now = sched_clock(), delta = 0;
+	unsigned long long now = task_rq(t)->clock, delta = 0;
 
 	if (t->sched_info.last_queued)
 		delta = now - t->sched_info.last_queued;
@@ -186,7 +186,7 @@ static inline void sched_info_queued(struct task_struct *t)
 {
 	if (unlikely(sched_info_on()))
 		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = sched_clock();
+			t->sched_info.last_queued = task_rq(t)->clock;
 }
 
 /*
@@ -195,7 +195,8 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-	unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+	unsigned long long delta = task_rq(t)->clock -
+					t->sched_info.last_arrival;
 
 	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
@@ -231,5 +232,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #else
 #define sched_info_queued(t)		do { } while (0)
 #define sched_info_switch(t, next)	do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b4efbe26445..3a1744fed2b6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -226,20 +226,23 @@ static struct ctl_table root_table[] = {
 
 #ifdef CONFIG_SCHED_DEBUG
 static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
-static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
-static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_nr_latency",
-		.data		= &sysctl_sched_nr_latency,
+		.procname	= "sched_min_granularity_ns",
+		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &sched_nr_latency_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -247,7 +250,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &sched_nr_latency_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
@@ -298,6 +301,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_nr_migrate",
+		.data		= &sysctl_sched_nr_migrate,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/kernel/timer.c b/kernel/timer.c
index 00e44e2afd67..a05817c021d6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -817,6 +817,19 @@ unsigned long next_timer_interrupt(void)
 
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+	if (user_tick) {
+		account_user_time(p, jiffies_to_cputime(1));
+		account_user_time_scaled(p, jiffies_to_cputime(1));
+	} else {
+		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+		account_system_time_scaled(p, jiffies_to_cputime(1));
+	}
+}
+#endif
+
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process.  user_tick is 1 if the tick is user time, 0 for system.
@@ -827,13 +840,7 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick) {
-		account_user_time(p, jiffies_to_cputime(1));
-		account_user_time_scaled(p, jiffies_to_cputime(1));
-	} else {
-		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
-		account_system_time_scaled(p, jiffies_to_cputime(1));
-	}
+	account_process_tick(p, user_tick);
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
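
Taken together, the tick path is now split in two: update_process_times() keeps the timer, RCU and scheduler work, while per-task time accounting sits behind account_process_tick(), defined generically here and overridden by the CONFIG_VIRT_CPU_ACCOUNTING architectures (powerpc and s390 above). The following is a hedged user-space sketch of that shape only, with printf() standing in for the real accounting and a string standing in for the task pointer; it is not the kernel code:

#include <stdio.h>

#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/* Generic fallback, as in kernel/timer.c; an arch with precise
 * virtual CPU accounting would provide its own definition instead. */
static void account_process_tick(const char *task, int user_tick)
{
	if (user_tick)
		printf("%s: +1 user tick\n", task);
	else
		printf("%s: +1 system tick\n", task);
}
#endif

static void update_process_times(int user_tick)
{
	/* Per-tick accounting now lives behind a single hook... */
	account_process_tick("current", user_tick);
	/* ...while run_local_timers(), RCU and scheduler_tick() stay here. */
}

int main(void)
{
	update_process_times(1);	/* tick charged as user time */
	update_process_times(0);	/* tick charged as system time */
	return 0;
}
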