Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 96
1 file changed, 59 insertions(+), 37 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f662ab7..d3c03070872d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,7 +22,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms, units: nanoseconds)
+ * (default: 20ms * ilog(ncpus), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -32,19 +32,24 @@
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
  */
-const_debug unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 20000000ULL;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
- * parent will (try to) run first.
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 1 msec * ilog(ncpus), units: nanoseconds)
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec, units: nanoseconds)
+ * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+unsigned int sched_nr_latency = 20;
+
+/*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_nr_latency = 20;
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
 
 /*
  * sys_sched_yield() compat mode
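
The new sched_nr_latency above is simply the quotient of the two tunables. A
minimal userspace sketch of that invariant, using the defaults from this hunk
(all names local to the sketch; the kernel recomputes the value with
DIV_ROUND_UP, as the sysctl handler further down shows):

    #include <stdio.h>

    int main(void)
    {
            /* defaults from this hunk, in nanoseconds */
            unsigned int latency  = 20000000U;  /* sysctl_sched_latency         */
            unsigned int min_gran = 1000000U;   /* sysctl_sched_min_granularity */

            /* DIV_ROUND_UP(latency, min_gran) */
            unsigned int nr_latency = (latency + min_gran - 1) / min_gran;

            printf("sched_nr_latency = %u\n", nr_latency);  /* prints 20 */
            return 0;
    }
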
@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+					sysctl_sched_min_granularity);
+
+	return 0;
+}
+#endif
 
 /*
  * The idea is to set a period in which each task runs once.
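
When either tunable is written, the handler above recomputes sched_nr_latency
so the ratio stays consistent. For context, a sketch of how such an entry is
typically wired up in kernel/sysctl.c on kernels of this era (abridged and
assumed, since the table itself is outside this diff; the min/max bound
variables are hypothetical):

    {
            .ctl_name       = CTL_UNNUMBERED,
            .procname       = "sched_min_granularity_ns",
            .data           = &sysctl_sched_min_granularity,
            .maxlen         = sizeof(unsigned int),
            .mode           = 0644,
            .proc_handler   = &sched_nr_latency_handler,
            .strategy       = &sysctl_intvec,
            .extra1         = &min_sched_granularity_ns,
            .extra2         = &max_sched_granularity_ns,
    },
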
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 static u64 __sched_period(unsigned long nr_running)
 {
 	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sysctl_sched_nr_latency;
+	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
 		period *= nr_running;
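
With the defaults above, every task gets one slot per 20 ms as long as at most
sched_nr_latency (20) tasks are runnable; beyond that the period stretches to
roughly nr_running * 1 ms. A runnable userspace model of the arithmetic
(values local to the sketch):

    #include <stdio.h>

    static unsigned long long sched_period(unsigned long nr_running)
    {
            unsigned long long period = 20000000ULL;  /* sysctl_sched_latency */
            unsigned long nr_latency  = 20;           /* sched_nr_latency     */

            if (nr_running > nr_latency) {
                    period *= nr_running;   /* stretch the period...     */
                    period /= nr_latency;   /* ...do_div() in the kernel */
            }
            return period;
    }

    int main(void)
    {
            printf("%llu\n", sched_period(5));   /* 20000000: latency holds */
            printf("%llu\n", sched_period(40));  /* 40000000: 40 * 1 ms     */
            return 0;
    }
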
@@ -259,6 +280,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 {
 	u64 vslice = __sched_period(nr_running);
 
+	vslice *= NICE_0_LOAD;
 	do_div(vslice, rq_weight);
 
 	return vslice;
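
The added multiply is the actual fix: previously the period was divided by the
runqueue weight directly, which collapses toward zero on any loaded queue. The
intended value is period * NICE_0_LOAD / rq_weight. A worked model, assuming
NICE_0_LOAD is 1024 as in kernels of this vintage (names local to the sketch):

    #include <stdio.h>

    #define NICE_0_LOAD 1024ULL

    static unsigned long long sched_vslice(unsigned long long period,
                                           unsigned long long rq_weight)
    {
            unsigned long long vslice = period;

            vslice *= NICE_0_LOAD;  /* the line this hunk adds */
            vslice /= rq_weight;    /* do_div() in the kernel  */
            return vslice;
    }

    int main(void)
    {
            /* two nice-0 tasks: each sees half of the 20 ms period */
            printf("%llu\n", sched_vslice(20000000ULL, 2 * NICE_0_LOAD));
            /* prints 10000000 */
            return 0;
    }
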
@@ -472,19 +494,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 		vruntime += sched_vslice(cfs_rq)/2;
 
+	/*
+	 * The 'current' period is already promised to the current tasks,
+	 * however the extra weight of the new task will slow them down a
+	 * little, place the new task so that it fits in the slot that
+	 * stays open at the end.
+	 */
 	if (initial && sched_feat(START_DEBIT))
 		vruntime += sched_vslice_add(cfs_rq, se);
 
 	if (!initial) {
+		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
 				task_of(se)->policy != SCHED_BATCH)
 			vruntime -= sysctl_sched_latency;
 
-		vruntime = max_t(s64, vruntime, se->vruntime);
+		/* ensure we never gain time by being placed backwards. */
+		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
 	se->vruntime = vruntime;
-
 }
 
 static void
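
The sleeper path credits a waking task with up to one sysctl_sched_latency,
then clamps so an entity can never be placed behind its own previous vruntime.
A signed-arithmetic sketch of just that clamp, assuming vruntime starts from
the queue's min_vruntime as in the surrounding code (names local to the
sketch):

    /* vruntime a waking entity is placed at (sleeper credit, then clamp) */
    static long long place_sleeper(long long min_vruntime, long long se_vruntime)
    {
            long long vruntime = min_vruntime - 20000000LL;  /* latency credit */

            /* ensure we never gain time by being placed backwards */
            return se_vruntime > vruntime ? se_vruntime : vruntime;
    }
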
@@ -517,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		se->peer_preempt = 0;
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -545,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime ||
-			(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
+	if (delta_exec > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
-	curr->peer_preempt = 0;
 }
 
 static void
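
With the PREEMPT_RESTRICT bookkeeping gone, the tick-time test reduces to a
single comparison: reschedule once the running task has consumed its slice. A
minimal model (names local to the sketch):

    /* tick-time preemption: has the running task consumed its slice? */
    static int tick_preempt(unsigned long long sum_exec_runtime,
                            unsigned long long prev_sum_exec_runtime,
                            unsigned long long ideal_runtime)
    {
            unsigned long long delta_exec =
                    sum_exec_runtime - prev_sum_exec_runtime;

            return delta_exec > ideal_runtime;
    }
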
@@ -811,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
+	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -826,24 +852,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->policy == SCHED_BATCH))
 		return;
 
-	if (sched_feat(WAKEUP_PREEMPT)) {
-		while (!is_same_group(se, pse)) {
-			se = parent_entity(se);
-			pse = parent_entity(pse);
-		}
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-		if (delta > gran) {
-			int now = !sched_feat(PREEMPT_RESTRICT);
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
-			if (now || p->prio < curr->prio || !se->peer_preempt++)
-				resched_task(curr);
-		}
-	}
+	if (pse->vruntime + gran < se->vruntime)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
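
The rewritten wakeup test preempts only when the wakee's vruntime trails the
running entity's by more than one weight-scaled wakeup granularity, replacing
the PREEMPT_RESTRICT bookkeeping. A model of the comparison (names local to
the sketch):

    /* should the waking entity 'pse' preempt the running entity 'se'? */
    static int wakeup_preempt(unsigned long long se_vruntime,
                              unsigned long long pse_vruntime,
                              unsigned long long gran)
    {
            return pse_vruntime + gran < se_vruntime;
    }

With the 10 ms default and nice-0 weights, a wakee lagging 15 ms in virtual
time preempts immediately, while one lagging 5 ms waits for the next tick.
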
@@ -1045,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	update_curr(cfs_rq);
 	place_entity(cfs_rq, se, 1);
 
+	/* 'curr' will be NULL if the child belongs to a different group */
 	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr->vruntime < se->vruntime) {
+			curr && curr->vruntime < se->vruntime) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1054,7 +1077,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	se->peer_preempt = 0;
 	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
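
Because place_entity() may put the child ahead of the parent, child-runs-first
is enforced by swapping the two vruntimes when needed; the new curr check
covers group scheduling, where the child can land on a different cfs_rq and
curr is then NULL. A sketch of the swap decision (names local to the sketch):

    /* hand the parent's (smaller) vruntime to the child so it runs first */
    static void child_runs_first(long long *curr_vruntime, long long *se_vruntime)
    {
            if (*curr_vruntime < *se_vruntime) {
                    long long tmp = *curr_vruntime;  /* swap() in the kernel */
                    *curr_vruntime = *se_vruntime;
                    *se_vruntime = tmp;
            }
    }
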