author     Linus Torvalds <torvalds@linux-foundation.org>  2008-03-19 00:27:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-03-19 00:27:13 -0400
commit     2caf470363941b70212a9a843cae02e8e2f751d9 (patch)
tree       149980ba161b932156c98ee8107594b6f957356f
parent     6c3c3158a81d6a92d335dd27ad9eb43f6b4c664b (diff)
parent     33b0c4217dcd67b788318c3192a2912b530e4eef (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel:
  sched: tune multi-core idle balancing
  sched: retune wake granularity
  sched: wakeup-buddy tasks are cache-hot
  sched: improve affine wakeups
  sched, net: socket wakeups are sync
  sched: clean up wakeup balancing, code flow
  sched: clean up wakeup balancing, rename variables
  sched: clean up wakeup balancing, move wake_affine()
-rw-r--r--  include/linux/sched.h      3
-rw-r--r--  include/linux/topology.h   1
-rw-r--r--  kernel/sched.c            11
-rw-r--r--  kernel/sched_debug.c       1
-rw-r--r--  kernel/sched_fair.c      191
-rw-r--r--  net/core/sock.c            4
6 files changed, 134 insertions, 77 deletions
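
The thread running through these commits is the new per-entity avg_overlap statistic: check_preempt_wakeup() records last_wakeup when a task wakes another, dequeue_entity() folds the runtime accumulated since then into avg_overlap via a shift-based running average (update_avg() in the kernel/sched_fair.c hunk below), and the rewritten wake_affine() treats a waker/wakee pair whose averages are both under sysctl_sched_migration_cost as synchronous buddies worth keeping on one CPU. The stand-alone C sketch below is an illustration only, not part of the merge; the sample values are invented, and it just shows how the avg += (sample - avg) >> 3 update behaves.

#include <stdint.h>
#include <stdio.h>

/* Same rule as update_avg() in the sched_fair.c hunk below:
 * a running average that moves 1/8th of the way toward each new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	/* Invented overlap samples, in nanoseconds: a task that keeps
	 * running ~100us after waking its partner, plus one outlier. */
	uint64_t samples[] = { 100000, 110000, 90000, 2000000, 95000, 105000 };
	uint64_t avg_overlap = 0;
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("sample %zu: %8llu ns -> avg_overlap %8llu ns\n",
		       i, (unsigned long long)samples[i],
		       (unsigned long long)avg_overlap);
	}

	/* wake_affine() in this merge treats waker and wakee as sync
	 * buddies when both averages stay below
	 * sysctl_sched_migration_cost (500000 ns in this tree). */
	printf("below migration cost (500000 ns)? %s\n",
	       avg_overlap < 500000 ? "yes" : "no");
	return 0;
}
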
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11d8e9a74eff..3625fcaf5d0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -929,6 +929,9 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
+	u64			last_wakeup;
+	u64			avg_overlap;
+
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2352f46160d3..2d8dac8799cf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -138,7 +138,6 @@
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
diff --git a/kernel/sched.c b/kernel/sched.c
index d1ad69b270ca..3f7c5eb254e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1396,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;
 
+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (&p->se == cfs_rq_of(&p->se)->next)
+		return 1;
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -1855,10 +1861,11 @@ out_activate:
 	schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;
 
 out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1892,6 +1899,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.last_wakeup		= 0;
+	p->se.avg_overlap		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start		= 0;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2cc59080efa..b85cac4b5e25 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -556,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -566,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -980,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+				p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
 
-	cpu      = task_cpu(p);
-	rq       = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu  = cpu;
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (cpu == this_cpu)
-		goto out_set_cpu;
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
+		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);
 
+		return 1;
+	}
+	return 0;
+}
+
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu	= task_cpu(p);
+	rq		= task_rq(p);
+	this_cpu	= smp_processor_id();
+	this_rq		= cpu_rq(this_cpu);
+	new_cpu		= prev_cpu;
+
+	/*
+	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}
 
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;
 
 	/*
 	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;
 
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+				     load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
+	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
@@ -1092,6 +1133,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
 	cfs_rq_of(pse)->next = pse;
 
 	/*
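
Read in isolation, the affine-wakeup decision above has two ways to say "run the wakee on the waking CPU": the sync-buddy test on avg_overlap, and the load comparison that discounts the sync waker, which is about to sleep. The user-space sketch below restates that predicate with plain integers; it is an illustration only, every number is an invented sample, and the local variables merely stand in for what source_load(), target_load(), cpu_avg_load_per_task(), se.load.weight and se.avg_overlap supply in the kernel.

#include <stdio.h>

int main(void)
{
	/* Loads, in arbitrary weight units */
	unsigned long load = 2048;		/* wakee's previous CPU */
	unsigned long this_load = 3072;		/* waking CPU */
	unsigned long prev_target_load = 2048;	/* target_load(prev_cpu, idx) */
	unsigned long tl_per_task = 2048;	/* cpu_avg_load_per_task(this_cpu) */
	unsigned long curr_weight = 1024;	/* waking task's load weight */
	unsigned long wakee_weight = 1024;	/* woken task's load weight */
	unsigned int imbalance = 112;		/* 100 + (imbalance_pct - 100) / 2 */

	/* Sync-buddy data, in nanoseconds */
	int sync = 1;
	unsigned long curr_overlap = 80000;
	unsigned long wakee_overlap = 90000;
	unsigned long migration_cost = 500000;	/* sysctl_sched_migration_cost */

	unsigned long tl = this_load;
	int affine = 0;

	/* Branch 1: both tasks are short-overlap buddies on a sync wakeup
	 * (the kernel also requires the waker to be a CFS task). */
	if (sync && curr_overlap < migration_cost &&
	    wakee_overlap < migration_cost)
		affine = 1;

	/* Branch 2: load comparison, discounting the sync waker since it
	 * is about to sleep. */
	if (!affine) {
		if (sync)
			tl -= curr_weight;

		if ((tl <= load && tl + prev_target_load <= tl_per_task) ||
		    100 * (tl + wakee_weight) <= imbalance * load)
			affine = 1;
	}

	printf("pull the wakee to the waking CPU? %s\n", affine ? "yes" : "no");
	return 0;
}
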
diff --git a/net/core/sock.c b/net/core/sock.c
index 09cb3a74de7f..2654c147c004 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1621,7 +1621,7 @@ static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible_sync(sk->sk_sleep);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1635,7 +1635,7 @@ static void sock_def_write_space(struct sock *sk)
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible(sk->sk_sleep);
+			wake_up_interruptible_sync(sk->sk_sleep);
 
 		/* Should agree with poll, otherwise some programs break */
 		if (sock_writeable(sk))
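
The two net/core/sock.c hunks switch socket wakeups to the _sync variant, hinting to the scheduler that the waker is about to block, which is exactly the pattern the wake_affine() buddy logic rewards. A request/response pair ping-ponging over a socketpair is the canonical shape of such a workload; the small user-space program below is an illustration only (not part of the merge, iteration count arbitrary) and simply generates that pattern so round-trip latency can be eyeballed before and after a scheduler change like this.

/* Minimal ping-pong over a socketpair: parent and child alternately
 * block on each other, so every wakeup is "synchronous" in the sense
 * these patches optimize for. */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	const int iterations = 100000;
	struct timeval start, end;
	double usec;
	char byte = 0;
	pid_t pid;
	int sv[2];
	int i;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
		perror("socketpair");
		return 1;
	}

	pid = fork();
	if (pid < 0) {
		perror("fork");
		return 1;
	}

	if (pid == 0) {
		/* Child: echo every byte straight back. */
		for (i = 0; i < iterations; i++) {
			if (read(sv[1], &byte, 1) != 1 ||
			    write(sv[1], &byte, 1) != 1)
				_exit(1);
		}
		_exit(0);
	}

	gettimeofday(&start, NULL);
	for (i = 0; i < iterations; i++) {
		if (write(sv[0], &byte, 1) != 1 ||
		    read(sv[0], &byte, 1) != 1) {
			perror("ping");
			return 1;
		}
	}
	gettimeofday(&end, NULL);
	waitpid(pid, NULL, 0);

	usec = (end.tv_sec - start.tv_sec) * 1e6 +
	       (end.tv_usec - start.tv_usec);
	printf("%d round trips, %.2f us per round trip\n",
	       iterations, usec / iterations);
	return 0;
}
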