path: root/kernel/sched_fair.c
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	205
1 files changed, 118 insertions, 87 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2cc59080efa..86a93376282c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
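
The first hunk halves the default wake-up granularity from 10 msec to 5 msec; per the updated comment, the effective value still scales as base * (1 + ilog(ncpus)). A minimal user-space sketch of that scaling (the ilog() helper and the printed values are illustrative, not kernel code):

#include <stdio.h>

/* Illustrative integer log2; stands in for the ilog() named in the comment. */
static unsigned int ilog(unsigned int ncpus)
{
	unsigned int log = 0;

	while (ncpus >>= 1)
		log++;
	return log;
}

int main(void)
{
	unsigned long base_ns = 5000000UL;	/* new default: 5 msec */

	for (unsigned int ncpus = 1; ncpus <= 8; ncpus *= 2)
		printf("%u CPUs -> %lu ns\n", ncpus, base_ns * (1 + ilog(ncpus)));
	return 0;
}
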
@@ -302,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 	return vslice;
 }
 
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
-	return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -504,15 +499,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else
 		vruntime = cfs_rq->min_vruntime;
 
-	if (sched_feat(TREE_AVG)) {
-		struct sched_entity *last = __pick_last_entity(cfs_rq);
-		if (last) {
-			vruntime += last->vruntime;
-			vruntime >>= 1;
-		}
-	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
-		vruntime += sched_vslice(cfs_rq)/2;
-
 	/*
 	 * The 'current' period is already promised to the current tasks,
 	 * however the extra weight of the new task will slow them down a
@@ -556,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
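
update_avg() is a shift-based exponential moving average: each new sample pulls the stored average by 1/8 of the difference, so avg_overlap tracks recent wakeup/sleep overlap without a division. A standalone sketch of the same arithmetic (user-space, with made-up sample values):

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the patch's update_avg(): avg += (sample - avg) / 8 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t samples[] = { 800000, 400000, 1200000 };	/* ns, made up */

	for (int i = 0; i < 3; i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("after %llu -> avg_overlap = %llu ns\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_overlap);
	}
	return 0;
}
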
@@ -566,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -980,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+				p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
+
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
+		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);
 
-	cpu = task_cpu(p);
-	rq = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu = cpu;
+		return 1;
+	}
+	return 0;
+}
 
-	if (cpu == this_cpu)
-		goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu = task_cpu(p);
+	rq = task_rq(p);
+	this_cpu = smp_processor_id();
+	this_rq = cpu_rq(this_cpu);
+	new_cpu = prev_cpu;
 
+	/*
+	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}
 
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;
 
 	/*
 	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;
 
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+			load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
+	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
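
The refactored select_task_rq_fair() now delegates the affine-wakeup decision to wake_affine(): a sync wakeup is pulled to this_cpu outright when both waker and wakee have avg_overlap below sysctl_sched_migration_cost; otherwise the usual load comparison decides. A rough sketch of that load test with hypothetical numbers standing in for the rq/task state (not the kernel's own data structures):

#include <stdio.h>

int main(void)
{
	unsigned long load = 2048;		/* source_load(prev_cpu, idx), hypothetical */
	unsigned long this_load = 1024;		/* target_load(this_cpu, idx), hypothetical */
	unsigned long task_weight = 1024;	/* p->se.load.weight of a nice-0 task */
	unsigned int imbalance_pct = 125;	/* hypothetical this_sd->imbalance_pct */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	/* the second half of the wake_affine() test in the patch */
	if (100 * (this_load + task_weight) <= imbalance * load)
		printf("affine wakeup: run the task on this_cpu\n");
	else
		printf("no affine wakeup: leave it near prev_cpu\n");
	return 0;
}
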
@@ -1092,6 +1119,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
 	cfs_rq_of(pse)->next = pse;
 
 	/*
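
The last hunk stamps se->last_wakeup with the running task's sum_exec_runtime when it wakes someone up; when that task later dequeues to sleep, update_avg_stats() folds the runtime elapsed since the stamp into avg_overlap. A rough user-space walk-through of that bookkeeping, assuming made-up nanosecond values:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical walk-through of the last_wakeup/avg_overlap bookkeeping. */
int main(void)
{
	uint64_t sum_exec_runtime = 10000000;	/* ns the waker has run, made up */
	uint64_t last_wakeup = 0, avg_overlap = 0;

	/* check_preempt_wakeup(): the running task stamps its wakeup time */
	last_wakeup = sum_exec_runtime;

	/* ...the waker keeps running a while before it goes to sleep... */
	sum_exec_runtime += 300000;

	/* dequeue_entity(sleep=1) -> update_avg_stats() */
	if (last_wakeup) {
		int64_t diff = (int64_t)(sum_exec_runtime - last_wakeup) -
			       (int64_t)avg_overlap;

		avg_overlap += diff >> 3;
		last_wakeup = 0;
	}

	printf("avg_overlap = %llu ns\n", (unsigned long long)avg_overlap);
	return 0;
}
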