Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  96
1 files changed, 59 insertions, 37 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f662ab7..d3c03070872d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,7 +22,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms, units: nanoseconds)
+ * (default: 20ms * ilog(ncpus), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -32,19 +32,24 @@
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
  */
-const_debug unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 20000000ULL;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
- * parent will (try to) run first.
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 1 msec * ilog(ncpus), units: nanoseconds)
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec, units: nanoseconds)
+ * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+unsigned int sched_nr_latency = 20;
+
+/*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_nr_latency = 20;
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
 
 /*
  * sys_sched_yield() compat mode
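
With the defaults above, the ratio the patch introduces works out cleanly (a worked example, not part of the patch): sysctl_sched_latency / sysctl_sched_min_granularity = 20000000 ns / 1000000 ns = 20, which matches the sched_nr_latency = 20 initializer. Since both tunables are scaled by the same ilog(ncpus) factor, the ratio is independent of CPU count.
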
@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+					sysctl_sched_min_granularity);
+
+	return 0;
+}
+#endif
 
 /*
  * The idea is to set a period in which each task runs once.
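
The handler above re-derives sched_nr_latency whenever either tunable is written through the sysctl interface. The following is a minimal user-space sketch of the same rounding, not kernel code; the values are arbitrary and only illustrate that DIV_ROUND_UP rounds the quotient up, so a latency that is not an exact multiple of the granularity still yields a full extra slot:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int latency  = 20000000;	/* sysctl_sched_latency, ns */
	unsigned int min_gran =  1500000;	/* sysctl_sched_min_granularity, ns */

	/* same computation the handler performs after a successful write */
	printf("sched_nr_latency = %u\n", DIV_ROUND_UP(latency, min_gran));
	/* prints 14: 20000000 / 1500000 rounded up */
	return 0;
}
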
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 static u64 __sched_period(unsigned long nr_running)
 {
 	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sysctl_sched_nr_latency;
+	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
 		period *= nr_running;
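
Only the head of __sched_period() appears in this hunk. Assuming the function continues with the usual do_div(period, nr_latency) step (not shown here), the effect of the precomputed ratio is that the period holds at sysctl_sched_latency until more than sched_nr_latency tasks are runnable, and then grows so that each task still receives roughly one minimum granularity. A small user-space sketch under that assumption:

#include <stdio.h>

int main(void)
{
	unsigned long long sysctl_sched_latency = 20000000ULL;	/* 20 ms in ns */
	unsigned long sched_nr_latency = 20;
	unsigned long nr_running;

	for (nr_running = 10; nr_running <= 40; nr_running += 10) {
		unsigned long long period = sysctl_sched_latency;

		if (nr_running > sched_nr_latency) {
			period *= nr_running;
			period /= sched_nr_latency;	/* stands in for do_div() */
		}
		printf("nr_running=%2lu  period=%llu ns\n", nr_running, period);
	}
	return 0;	/* 20 ms, 20 ms, 30 ms, 40 ms for 10, 20, 30, 40 tasks */
}
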
@@ -259,6 +280,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 {
 	u64 vslice = __sched_period(nr_running);
 
+	vslice *= NICE_0_LOAD;
 	do_div(vslice, rq_weight);
 
 	return vslice;
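
The one-line change above fixes the scale of the result: dividing the period by the total runqueue weight directly produces a value that is far too small, whereas scaling by NICE_0_LOAD (1024 on a standard build) first yields a slice in the same units as vruntime. As a worked example, for a 20 ms period and two nice-0 tasks (rq_weight = 2 * 1024), vslice = 20 ms * 1024 / 2048 = 10 ms.
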
@@ -472,19 +494,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 		vruntime += sched_vslice(cfs_rq)/2;
 
+	/*
+	 * The 'current' period is already promised to the current tasks,
+	 * however the extra weight of the new task will slow them down a
+	 * little, place the new task so that it fits in the slot that
+	 * stays open at the end.
+	 */
 	if (initial && sched_feat(START_DEBIT))
 		vruntime += sched_vslice_add(cfs_rq, se);
 
 	if (!initial) {
+		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
 				task_of(se)->policy != SCHED_BATCH)
 			vruntime -= sysctl_sched_latency;
 
-		vruntime = max_t(s64, vruntime, se->vruntime);
+		/* ensure we never gain time by being placed backwards. */
+		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
 	se->vruntime = vruntime;
-
 }
 
 static void
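
Switching from max_t(s64, ...) to max_vruntime() matters once vruntime approaches u64 wraparound: comparing the raw unsigned values gives the wrong answer there, while comparing the signed difference does not. A self-contained sketch of such a wrap-safe maximum (uint64_t/int64_t stand in for the kernel's u64/s64; the in-tree helper may differ in detail):

#include <stdint.h>
#include <stdio.h>

/* wrap-safe "later of two vruntimes": compare the signed difference,
 * not the raw unsigned values */
static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

int main(void)
{
	uint64_t near_wrap = (uint64_t)-1000;	/* just below 2^64 */
	uint64_t wrapped   = 5000;		/* already wrapped past zero */

	/* the wrapped value is "later", even though it is numerically smaller */
	printf("%d\n", max_vruntime(near_wrap, wrapped) == wrapped);	/* prints 1 */
	return 0;
}
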
@@ -517,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		se->peer_preempt = 0;
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -545,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime ||
-			(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
+	if (delta_exec > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
-	curr->peer_preempt = 0;
 }
 
 static void
@@ -811,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
+	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -826,24 +852,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->policy == SCHED_BATCH))
 		return;
 
-	if (sched_feat(WAKEUP_PREEMPT)) {
-		while (!is_same_group(se, pse)) {
-			se = parent_entity(se);
-			pse = parent_entity(pse);
-		}
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-		if (delta > gran) {
-			int now = !sched_feat(PREEMPT_RESTRICT);
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
-			if (now || p->prio < curr->prio || !se->peer_preempt++)
-				resched_task(curr);
-		}
-	}
+	if (pse->vruntime + gran < se->vruntime)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
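
The rewritten wakeup test reads directly off the vruntimes: the waking task preempts only when it trails the running task by more than one (weight-adjusted) wakeup granularity. As a worked example, with the running task at se->vruntime = 12 ms, the wakee at pse->vruntime = 9 ms and gran = 2 ms, 9 ms + 2 ms < 12 ms holds and curr is rescheduled; a wakee at 11 ms would simply wait its turn.
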
@@ -1045,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	update_curr(cfs_rq);
 	place_entity(cfs_rq, se, 1);
 
+	/* 'curr' will be NULL if the child belongs to a different group */
 	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr->vruntime < se->vruntime) {
+			curr && curr->vruntime < se->vruntime) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1054,7 +1077,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	se->peer_preempt = 0;
 	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }