Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c          |  6
-rw-r--r--  kernel/futex_compat.c  | 26
-rw-r--r--  kernel/sched.c         | 59
-rw-r--r--  kernel/sched_debug.c   |  2
-rw-r--r--  kernel/sched_fair.c    | 96
-rw-r--r--  kernel/sched_stats.h   | 11
-rw-r--r--  kernel/sysctl.c        | 23
-rw-r--r--  kernel/timer.c         | 21
8 files changed, 163 insertions, 81 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 28a740151988..8ca1a14cdc8c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
@@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
 	 * on the tasklist. */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 00b572666cc7..0a43def6fee7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,6 +30,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 	return 0;
 }
 
+static void __user *futex_uaddr(struct robust_list *entry,
+				compat_long_t futex_offset)
+{
+	compat_uptr_t base = ptr_to_compat(entry);
+	void __user *uaddr = compat_ptr(base + futex_offset);
+
+	return uaddr;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -76,11 +85,12 @@ void compat_exit_robust_list(struct task_struct *curr)
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
 		 */
-		if (entry != pending)
-			if (handle_futex_death((void __user *)entry + futex_offset,
-						curr, pi))
-				return;
+		if (entry != pending) {
+			void __user *uaddr = futex_uaddr(entry, futex_offset);
 
+			if (handle_futex_death(uaddr, curr, pi))
+				return;
+		}
 		if (rc)
 			return;
 		uentry = next_uentry;
@@ -94,9 +104,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
-	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset,
-				   curr, pip);
+	if (pending) {
+		void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+		handle_futex_death(uaddr, curr, pip);
+	}
 }
 
 asmlinkage long
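
Note: the new futex_uaddr() helper above replaces the open-coded "(void __user *)entry + futex_offset" expression. As a rough illustration only (plain user-space C with stand-in types and made-up values, not kernel code), the address computation it performs looks like this:

/* User-space sketch of futex_uaddr(): the kernel-side pointer is narrowed to
 * a 32-bit compat pointer, the 32-bit signed futex offset is added in that
 * 32-bit space, and the result is widened back into a user address.
 * Types and sample values are illustrative, not the kernel's definitions. */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t compat_uptr_t;	/* stand-in for the kernel's compat_uptr_t */
typedef int32_t  compat_long_t;	/* stand-in for compat_long_t */

static uint64_t model_futex_uaddr(uint64_t entry, compat_long_t futex_offset)
{
	compat_uptr_t base = (compat_uptr_t)entry;		/* ~ ptr_to_compat(entry) */
	compat_uptr_t uaddr = base + (compat_uptr_t)futex_offset;
	return (uint64_t)uaddr;					/* ~ compat_ptr(uaddr)     */
}

int main(void)
{
	/* e.g. a robust-list entry at 0x7ff01000 with the futex 16 bytes before it */
	printf("%#llx\n", (unsigned long long)model_futex_uaddr(0x7ff01000u, -16));
	return 0;
}
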
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f6bd1112900..b18f231a4875 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,7 +75,7 @@
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
-	return (unsigned long long)jiffies * (1000000000 / HZ);
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
 
 /*
@@ -99,8 +99,8 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
@@ -460,7 +460,6 @@ enum {
 	SCHED_FEAT_TREE_AVG		= 4,
 	SCHED_FEAT_APPROX_AVG		= 8,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
-	SCHED_FEAT_PREEMPT_RESTRICT	= 32,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -468,12 +467,17 @@ const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
 		SCHED_FEAT_APPROX_AVG		* 0 |
-		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
-		SCHED_FEAT_PREEMPT_RESTRICT	* 1;
+		SCHED_FEAT_WAKEUP_PREEMPT	* 1;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
 /*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
@@ -2237,7 +2241,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2251,10 +2255,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	 */
 	p = iterator->start(iterator->arg);
 next:
-	if (!p)
+	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
 	/*
-	 * To help distribute high priority tasks accross CPUs we don't
+	 * To help distribute high priority tasks across CPUs we don't
 	 * skip a task if it will be the highest priority task (i.e. smallest
 	 * prio value) on its new queue regardless of its load weight
 	 */
@@ -2271,8 +2275,7 @@ next:
 	rem_load_move -= p->se.load.weight;
 
 	/*
-	 * We only want to steal up to the prescribed number of tasks
-	 * and the prescribed amount of weighted load.
+	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
 	if (rem_load_move > 0) {
 		if (p->prio < *this_best_prio)
@@ -4992,6 +4995,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  */
 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+	unsigned int factor = 1 + ilog2(num_online_cpus());
+	const unsigned long limit = 200000000;
+
+	sysctl_sched_min_granularity *= factor;
+	if (sysctl_sched_min_granularity > limit)
+		sysctl_sched_min_granularity = limit;
+
+	sysctl_sched_latency *= factor;
+	if (sysctl_sched_latency > limit)
+		sysctl_sched_latency = limit;
+
+	sysctl_sched_wakeup_granularity *= factor;
+	sysctl_sched_batch_wakeup_granularity *= factor;
+}
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -5621,7 +5650,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-int __init migration_init(void)
+void __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -5631,8 +5660,6 @@ int __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
-
-	return 0;
 }
 #endif
 
@@ -6688,10 +6715,12 @@ void __init sched_init_smp(void)
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
+	sched_init_granularity();
 }
 #else
 void __init sched_init_smp(void)
 {
+	sched_init_granularity();
 }
 #endif /* CONFIG_SMP */
 
@@ -7228,7 +7257,7 @@ static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
 		spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
 	}
 	/* Convert from ns to ms */
-	do_div(res, 1000000);
+	do_div(res, NSEC_PER_MSEC);
 
 	return res;
 }
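
Note: for a rough feel of what the new sched_init_granularity() above does at boot, here is a stand-alone user-space sketch of the same scaling, using the default tunable values that appear in kernel/sched_fair.c further down in this diff. The helper ilog2_u() and the sample CPU counts are invented for illustration; this is not kernel code.

/* User-space model: each tunable is multiplied by 1 + log2(nr_cpus) and the
 * latency/granularity values are capped at 200ms, mirroring the logic of
 * sched_init_granularity() in the hunk above. */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int l = 0;
	while (n >>= 1)
		l++;
	return l;
}

int main(void)
{
	const unsigned long limit = 200000000;		/* 200ms cap, in ns */
	unsigned long latency = 20000000, min_gran = 1000000;

	for (unsigned int cpus = 1; cpus <= 64; cpus *= 4) {
		unsigned int factor = 1 + ilog2_u(cpus);
		unsigned long lat = latency * factor;
		unsigned long gran = min_gran * factor;

		if (lat > limit)
			lat = limit;
		if (gran > limit)
			gran = limit;
		printf("%2u cpus: factor %u, latency %lu ns, min_granularity %lu ns\n",
		       cpus, factor, lat, gran);
	}
	return 0;
}
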
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 415e5c385542..ca198a797bfa 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -211,7 +211,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #define PN(x) \
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
-	PN(sysctl_sched_nr_latency);
+	PN(sysctl_sched_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	PN(sysctl_sched_batch_wakeup_granularity);
 	PN(sysctl_sched_child_runs_first);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f662ab7..d3c03070872d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,7 +22,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms, units: nanoseconds)
+ * (default: 20ms * ilog(ncpus), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -32,19 +32,24 @@
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
  */
-const_debug unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 20000000ULL;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
- * parent will (try to) run first.
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 1 msec * ilog(ncpus), units: nanoseconds)
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec, units: nanoseconds)
+ * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+unsigned int sched_nr_latency = 20;
+
+/*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_nr_latency = 20;
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
 
 /*
  * sys_sched_yield() compat mode
@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+					sysctl_sched_min_granularity);
+
+	return 0;
+}
+#endif
 
 /*
  * The idea is to set a period in which each task runs once.
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 static u64 __sched_period(unsigned long nr_running)
 {
 	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sysctl_sched_nr_latency;
+	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
 		period *= nr_running;
@@ -259,6 +280,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 {
 	u64 vslice = __sched_period(nr_running);
 
+	vslice *= NICE_0_LOAD;
 	do_div(vslice, rq_weight);
 
 	return vslice;
@@ -472,19 +494,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 		vruntime += sched_vslice(cfs_rq)/2;
 
+	/*
+	 * The 'current' period is already promised to the current tasks,
+	 * however the extra weight of the new task will slow them down a
+	 * little, place the new task so that it fits in the slot that
+	 * stays open at the end.
+	 */
 	if (initial && sched_feat(START_DEBIT))
 		vruntime += sched_vslice_add(cfs_rq, se);
 
 	if (!initial) {
+		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
 				task_of(se)->policy != SCHED_BATCH)
 			vruntime -= sysctl_sched_latency;
 
-		vruntime = max_t(s64, vruntime, se->vruntime);
+		/* ensure we never gain time by being placed backwards. */
+		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
 	se->vruntime = vruntime;
-
 }
 
 static void
@@ -517,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		se->peer_preempt = 0;
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -545,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime ||
-			(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
+	if (delta_exec > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
-	curr->peer_preempt = 0;
 }
 
 static void
@@ -811,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
+	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -826,24 +852,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->policy == SCHED_BATCH))
 		return;
 
-	if (sched_feat(WAKEUP_PREEMPT)) {
-		while (!is_same_group(se, pse)) {
-			se = parent_entity(se);
-			pse = parent_entity(pse);
-		}
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-		if (delta > gran) {
-			int now = !sched_feat(PREEMPT_RESTRICT);
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
-			if (now || p->prio < curr->prio || !se->peer_preempt++)
-				resched_task(curr);
-		}
-	}
+	if (pse->vruntime + gran < se->vruntime)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1045,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	update_curr(cfs_rq);
 	place_entity(cfs_rq, se, 1);
 
+	/* 'curr' will be NULL if the child belongs to a different group */
 	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr->vruntime < se->vruntime) {
+			curr && curr->vruntime < se->vruntime) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1054,7 +1077,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	se->peer_preempt = 0;
 	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
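
Note: the rewritten check_preempt_wakeup() above now preempts only when the waking entity's vruntime trails the running entity's by more than the wakeup granularity. A minimal user-space sketch of just that comparison follows (not kernel code; the load scaling via calc_delta_fair() is omitted and the values are invented):

/* Models the new test: if (pse->vruntime + gran < se->vruntime) resched_task(curr); */
#include <stdint.h>
#include <stdio.h>

static int should_preempt(uint64_t se_vruntime, uint64_t pse_vruntime,
			  unsigned long gran)
{
	return pse_vruntime + gran < se_vruntime;
}

int main(void)
{
	unsigned long gran = 10000000;	/* 10ms default wakeup granularity, in ns */

	printf("%d\n", should_preempt(25000000, 10000000, gran)); /* 1: waker is far behind, preempt */
	printf("%d\n", should_preempt(15000000, 10000000, gran)); /* 0: within the granularity, don't */
	return 0;
}
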
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ef1a7df80ea2..630178e53bb6 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -127,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu. We should note that with the exception of interactive
@@ -155,7 +155,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-	unsigned long long now = sched_clock(), delta = 0;
+	unsigned long long now = task_rq(t)->clock, delta = 0;
 
 	if (t->sched_info.last_queued)
 		delta = now - t->sched_info.last_queued;
@@ -186,7 +186,7 @@ static inline void sched_info_queued(struct task_struct *t)
 {
 	if (unlikely(sched_info_on()))
 		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = sched_clock();
+			t->sched_info.last_queued = task_rq(t)->clock;
 }
 
 /*
@@ -195,7 +195,8 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-	unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+	unsigned long long delta = task_rq(t)->clock -
+					t->sched_info.last_arrival;
 
 	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
@@ -231,5 +232,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #else
 #define sched_info_queued(t)		do { } while (0)
 #define sched_info_switch(t, next)	do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b4efbe26445..3a1744fed2b6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -226,20 +226,23 @@ static struct ctl_table root_table[] = {
 
 #ifdef CONFIG_SCHED_DEBUG
 static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
-static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
-static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_nr_latency",
-		.data		= &sysctl_sched_nr_latency,
+		.procname	= "sched_min_granularity_ns",
+		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &sched_nr_latency_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -247,7 +250,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &sched_nr_latency_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
@@ -298,6 +301,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_nr_migrate",
+		.data		= &sysctl_sched_nr_migrate,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
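
Note: the sched_min_granularity_ns and sched_latency_ns entries above are now both routed through sched_nr_latency_handler() (added in kernel/sched_fair.c earlier in this diff), which keeps sched_nr_latency equal to the rounded-up ratio of the two tunables. A small stand-alone sketch of that recomputation, with illustrative values only:

/* User-space model of the invariant maintained on each sysctl write:
 * sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity) */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int latency = 20000000;	/* 20ms default */
	unsigned int min_gran = 1000000;	/* 1ms default  */

	/* 20ms / 1ms -> 20 tasks fit in one latency period */
	printf("sched_nr_latency = %u\n", DIV_ROUND_UP(latency, min_gran));

	/* e.g. after writing sched_min_granularity_ns = 3ms -> rounds up to 7 */
	printf("sched_nr_latency = %u\n", DIV_ROUND_UP(latency, 3000000));
	return 0;
}
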
diff --git a/kernel/timer.c b/kernel/timer.c
index 00e44e2afd67..a05817c021d6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -817,6 +817,19 @@ unsigned long next_timer_interrupt(void)
 
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+	if (user_tick) {
+		account_user_time(p, jiffies_to_cputime(1));
+		account_user_time_scaled(p, jiffies_to_cputime(1));
+	} else {
+		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+		account_system_time_scaled(p, jiffies_to_cputime(1));
+	}
+}
+#endif
+
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -827,13 +840,7 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick) {
-		account_user_time(p, jiffies_to_cputime(1));
-		account_user_time_scaled(p, jiffies_to_cputime(1));
-	} else {
-		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
-		account_system_time_scaled(p, jiffies_to_cputime(1));
-	}
+	account_process_tick(p, user_tick);
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);