diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-19 00:27:13 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-19 00:27:13 -0400 |
commit | 2caf470363941b70212a9a843cae02e8e2f751d9 (patch) | |
tree | 149980ba161b932156c98ee8107594b6f957356f | |
parent | 6c3c3158a81d6a92d335dd27ad9eb43f6b4c664b (diff) | |
parent | 33b0c4217dcd67b788318c3192a2912b530e4eef (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel:
sched: tune multi-core idle balancing
sched: retune wake granularity
sched: wakeup-buddy tasks are cache-hot
sched: improve affine wakeups
sched, net: socket wakeups are sync
sched: clean up wakeup balancing, code flow
sched: clean up wakeup balancing, rename variables
sched: clean up wakeup balancing, move wake_affine()
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | include/linux/topology.h | 1 | ||||
-rw-r--r-- | kernel/sched.c | 11 | ||||
-rw-r--r-- | kernel/sched_debug.c | 1 | ||||
-rw-r--r-- | kernel/sched_fair.c | 191 | ||||
-rw-r--r-- | net/core/sock.c | 4 |
6 files changed, 134 insertions, 77 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 11d8e9a74ef..3625fcaf5d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -929,6 +929,9 @@ struct sched_entity { | |||
929 | u64 vruntime; | 929 | u64 vruntime; |
930 | u64 prev_sum_exec_runtime; | 930 | u64 prev_sum_exec_runtime; |
931 | 931 | ||
932 | u64 last_wakeup; | ||
933 | u64 avg_overlap; | ||
934 | |||
932 | #ifdef CONFIG_SCHEDSTATS | 935 | #ifdef CONFIG_SCHEDSTATS |
933 | u64 wait_start; | 936 | u64 wait_start; |
934 | u64 wait_max; | 937 | u64 wait_max; |
diff --git a/include/linux/topology.h b/include/linux/topology.h index 2352f46160d..2d8dac8799c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -138,7 +138,6 @@ | |||
138 | | SD_BALANCE_FORK \ | 138 | | SD_BALANCE_FORK \ |
139 | | SD_BALANCE_EXEC \ | 139 | | SD_BALANCE_EXEC \ |
140 | | SD_WAKE_AFFINE \ | 140 | | SD_WAKE_AFFINE \ |
141 | | SD_WAKE_IDLE \ | ||
142 | | SD_SHARE_PKG_RESOURCES\ | 141 | | SD_SHARE_PKG_RESOURCES\ |
143 | | BALANCE_FOR_MC_POWER, \ | 142 | | BALANCE_FOR_MC_POWER, \ |
144 | .last_balance = jiffies, \ | 143 | .last_balance = jiffies, \ |
diff --git a/kernel/sched.c b/kernel/sched.c index d1ad69b270c..3f7c5eb254e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -1396,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
1396 | { | 1396 | { |
1397 | s64 delta; | 1397 | s64 delta; |
1398 | 1398 | ||
1399 | /* | ||
1400 | * Buddy candidates are cache hot: | ||
1401 | */ | ||
1402 | if (&p->se == cfs_rq_of(&p->se)->next) | ||
1403 | return 1; | ||
1404 | |||
1399 | if (p->sched_class != &fair_sched_class) | 1405 | if (p->sched_class != &fair_sched_class) |
1400 | return 0; | 1406 | return 0; |
1401 | 1407 | ||
@@ -1855,10 +1861,11 @@ out_activate: | |||
1855 | schedstat_inc(p, se.nr_wakeups_remote); | 1861 | schedstat_inc(p, se.nr_wakeups_remote); |
1856 | update_rq_clock(rq); | 1862 | update_rq_clock(rq); |
1857 | activate_task(rq, p, 1); | 1863 | activate_task(rq, p, 1); |
1858 | check_preempt_curr(rq, p); | ||
1859 | success = 1; | 1864 | success = 1; |
1860 | 1865 | ||
1861 | out_running: | 1866 | out_running: |
1867 | check_preempt_curr(rq, p); | ||
1868 | |||
1862 | p->state = TASK_RUNNING; | 1869 | p->state = TASK_RUNNING; |
1863 | #ifdef CONFIG_SMP | 1870 | #ifdef CONFIG_SMP |
1864 | if (p->sched_class->task_wake_up) | 1871 | if (p->sched_class->task_wake_up) |
@@ -1892,6 +1899,8 @@ static void __sched_fork(struct task_struct *p) | |||
1892 | p->se.exec_start = 0; | 1899 | p->se.exec_start = 0; |
1893 | p->se.sum_exec_runtime = 0; | 1900 | p->se.sum_exec_runtime = 0; |
1894 | p->se.prev_sum_exec_runtime = 0; | 1901 | p->se.prev_sum_exec_runtime = 0; |
1902 | p->se.last_wakeup = 0; | ||
1903 | p->se.avg_overlap = 0; | ||
1895 | 1904 | ||
1896 | #ifdef CONFIG_SCHEDSTATS | 1905 | #ifdef CONFIG_SCHEDSTATS |
1897 | p->se.wait_start = 0; | 1906 | p->se.wait_start = 0; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4..ef358ba0768 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
288 | PN(se.exec_start); | 288 | PN(se.exec_start); |
289 | PN(se.vruntime); | 289 | PN(se.vruntime); |
290 | PN(se.sum_exec_runtime); | 290 | PN(se.sum_exec_runtime); |
291 | PN(se.avg_overlap); | ||
291 | 292 | ||
292 | nr_switches = p->nvcsw + p->nivcsw; | 293 | nr_switches = p->nvcsw + p->nivcsw; |
293 | 294 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2cc59080ef..b85cac4b5e2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | |||
73 | 73 | ||
74 | /* | 74 | /* |
75 | * SCHED_OTHER wake-up granularity. | 75 | * SCHED_OTHER wake-up granularity. |
76 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | 76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) |
77 | * | 77 | * |
78 | * This option delays the preemption effects of decoupled workloads | 78 | * This option delays the preemption effects of decoupled workloads |
79 | * and reduces their over-scheduling. Synchronous workloads will still | 79 | * and reduces their over-scheduling. Synchronous workloads will still |
80 | * have immediate wakeup/sleep latencies. | 80 | * have immediate wakeup/sleep latencies. |
81 | */ | 81 | */ |
82 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; |
83 | 83 | ||
84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
85 | 85 | ||
@@ -556,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
556 | account_entity_enqueue(cfs_rq, se); | 556 | account_entity_enqueue(cfs_rq, se); |
557 | } | 557 | } |
558 | 558 | ||
559 | static void update_avg(u64 *avg, u64 sample) | ||
560 | { | ||
561 | s64 diff = sample - *avg; | ||
562 | *avg += diff >> 3; | ||
563 | } | ||
564 | |||
565 | static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
566 | { | ||
567 | if (!se->last_wakeup) | ||
568 | return; | ||
569 | |||
570 | update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); | ||
571 | se->last_wakeup = 0; | ||
572 | } | ||
573 | |||
559 | static void | 574 | static void |
560 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 575 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
561 | { | 576 | { |
@@ -566,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
566 | 581 | ||
567 | update_stats_dequeue(cfs_rq, se); | 582 | update_stats_dequeue(cfs_rq, se); |
568 | if (sleep) { | 583 | if (sleep) { |
584 | update_avg_stats(cfs_rq, se); | ||
569 | #ifdef CONFIG_SCHEDSTATS | 585 | #ifdef CONFIG_SCHEDSTATS |
570 | if (entity_is_task(se)) { | 586 | if (entity_is_task(se)) { |
571 | struct task_struct *tsk = task_of(se); | 587 | struct task_struct *tsk = task_of(se); |
@@ -980,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
980 | #endif | 996 | #endif |
981 | 997 | ||
982 | #ifdef CONFIG_SMP | 998 | #ifdef CONFIG_SMP |
983 | static int select_task_rq_fair(struct task_struct *p, int sync) | 999 | |
1000 | static const struct sched_class fair_sched_class; | ||
1001 | |||
1002 | static int | ||
1003 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | ||
1004 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | ||
1005 | int idx, unsigned long load, unsigned long this_load, | ||
1006 | unsigned int imbalance) | ||
984 | { | 1007 | { |
985 | int cpu, this_cpu; | 1008 | struct task_struct *curr = this_rq->curr; |
986 | struct rq *rq; | 1009 | unsigned long tl = this_load; |
987 | struct sched_domain *sd, *this_sd = NULL; | 1010 | unsigned long tl_per_task; |
988 | int new_cpu; | 1011 | |
1012 | if (!(this_sd->flags & SD_WAKE_AFFINE)) | ||
1013 | return 0; | ||
1014 | |||
1015 | /* | ||
1016 | * If the currently running task will sleep within | ||
1017 | * a reasonable amount of time then attract this newly | ||
1018 | * woken task: | ||
1019 | */ | ||
1020 | if (sync && curr->sched_class == &fair_sched_class) { | ||
1021 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | ||
1022 | p->se.avg_overlap < sysctl_sched_migration_cost) | ||
1023 | return 1; | ||
1024 | } | ||
989 | 1025 | ||
990 | cpu = task_cpu(p); | 1026 | schedstat_inc(p, se.nr_wakeups_affine_attempts); |
991 | rq = task_rq(p); | 1027 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
992 | this_cpu = smp_processor_id(); | ||
993 | new_cpu = cpu; | ||
994 | 1028 | ||
995 | if (cpu == this_cpu) | 1029 | /* |
996 | goto out_set_cpu; | 1030 | * If sync wakeup then subtract the (maximum possible) |
1031 | * effect of the currently running task from the load | ||
1032 | * of the current CPU: | ||
1033 | */ | ||
1034 | if (sync) | ||
1035 | tl -= current->se.load.weight; | ||
1036 | |||
1037 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || | ||
1038 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1039 | /* | ||
1040 | * This domain has SD_WAKE_AFFINE and | ||
1041 | * p is cache cold in this domain, and | ||
1042 | * there is no bad imbalance. | ||
1043 | */ | ||
1044 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1045 | schedstat_inc(p, se.nr_wakeups_affine); | ||
997 | 1046 | ||
1047 | return 1; | ||
1048 | } | ||
1049 | return 0; | ||
1050 | } | ||
1051 | |||
1052 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
1053 | { | ||
1054 | struct sched_domain *sd, *this_sd = NULL; | ||
1055 | int prev_cpu, this_cpu, new_cpu; | ||
1056 | unsigned long load, this_load; | ||
1057 | struct rq *rq, *this_rq; | ||
1058 | unsigned int imbalance; | ||
1059 | int idx; | ||
1060 | |||
1061 | prev_cpu = task_cpu(p); | ||
1062 | rq = task_rq(p); | ||
1063 | this_cpu = smp_processor_id(); | ||
1064 | this_rq = cpu_rq(this_cpu); | ||
1065 | new_cpu = prev_cpu; | ||
1066 | |||
1067 | /* | ||
1068 | * 'this_sd' is the first domain that both | ||
1069 | * this_cpu and prev_cpu are present in: | ||
1070 | */ | ||
998 | for_each_domain(this_cpu, sd) { | 1071 | for_each_domain(this_cpu, sd) { |
999 | if (cpu_isset(cpu, sd->span)) { | 1072 | if (cpu_isset(prev_cpu, sd->span)) { |
1000 | this_sd = sd; | 1073 | this_sd = sd; |
1001 | break; | 1074 | break; |
1002 | } | 1075 | } |
1003 | } | 1076 | } |
1004 | 1077 | ||
1005 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1078 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1006 | goto out_set_cpu; | 1079 | goto out; |
1007 | 1080 | ||
1008 | /* | 1081 | /* |
1009 | * Check for affine wakeup and passive balancing possibilities. | 1082 | * Check for affine wakeup and passive balancing possibilities. |
1010 | */ | 1083 | */ |
1011 | if (this_sd) { | 1084 | if (!this_sd) |
1012 | int idx = this_sd->wake_idx; | 1085 | goto out; |
1013 | unsigned int imbalance; | ||
1014 | unsigned long load, this_load; | ||
1015 | |||
1016 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1017 | |||
1018 | load = source_load(cpu, idx); | ||
1019 | this_load = target_load(this_cpu, idx); | ||
1020 | |||
1021 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1022 | |||
1023 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1024 | unsigned long tl = this_load; | ||
1025 | unsigned long tl_per_task; | ||
1026 | |||
1027 | /* | ||
1028 | * Attract cache-cold tasks on sync wakeups: | ||
1029 | */ | ||
1030 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1031 | goto out_set_cpu; | ||
1032 | |||
1033 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1034 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1035 | |||
1036 | /* | ||
1037 | * If sync wakeup then subtract the (maximum possible) | ||
1038 | * effect of the currently running task from the load | ||
1039 | * of the current CPU: | ||
1040 | */ | ||
1041 | if (sync) | ||
1042 | tl -= current->se.load.weight; | ||
1043 | |||
1044 | if ((tl <= load && | ||
1045 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1046 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1047 | /* | ||
1048 | * This domain has SD_WAKE_AFFINE and | ||
1049 | * p is cache cold in this domain, and | ||
1050 | * there is no bad imbalance. | ||
1051 | */ | ||
1052 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1053 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1054 | goto out_set_cpu; | ||
1055 | } | ||
1056 | } | ||
1057 | 1086 | ||
1058 | /* | 1087 | idx = this_sd->wake_idx; |
1059 | * Start passive balancing when half the imbalance_pct | 1088 | |
1060 | * limit is reached. | 1089 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1061 | */ | 1090 | |
1062 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1091 | load = source_load(prev_cpu, idx); |
1063 | if (imbalance*this_load <= 100*load) { | 1092 | this_load = target_load(this_cpu, idx); |
1064 | schedstat_inc(this_sd, ttwu_move_balance); | 1093 | |
1065 | schedstat_inc(p, se.nr_wakeups_passive); | 1094 | if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, |
1066 | goto out_set_cpu; | 1095 | load, this_load, imbalance)) |
1067 | } | 1096 | return this_cpu; |
1097 | |||
1098 | if (prev_cpu == this_cpu) | ||
1099 | goto out; | ||
1100 | |||
1101 | /* | ||
1102 | * Start passive balancing when half the imbalance_pct | ||
1103 | * limit is reached. | ||
1104 | */ | ||
1105 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1106 | if (imbalance*this_load <= 100*load) { | ||
1107 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1108 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1109 | return this_cpu; | ||
1068 | } | 1110 | } |
1069 | } | 1111 | } |
1070 | 1112 | ||
1071 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | 1113 | out: |
1072 | out_set_cpu: | ||
1073 | return wake_idle(new_cpu, p); | 1114 | return wake_idle(new_cpu, p); |
1074 | } | 1115 | } |
1075 | #endif /* CONFIG_SMP */ | 1116 | #endif /* CONFIG_SMP */ |
@@ -1092,6 +1133,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1092 | return; | 1133 | return; |
1093 | } | 1134 | } |
1094 | 1135 | ||
1136 | se->last_wakeup = se->sum_exec_runtime; | ||
1137 | if (unlikely(se == pse)) | ||
1138 | return; | ||
1139 | |||
1095 | cfs_rq_of(pse)->next = pse; | 1140 | cfs_rq_of(pse)->next = pse; |
1096 | 1141 | ||
1097 | /* | 1142 | /* |
diff --git a/net/core/sock.c b/net/core/sock.c index 09cb3a74de7..2654c147c00 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -1621,7 +1621,7 @@ static void sock_def_readable(struct sock *sk, int len) | |||
1621 | { | 1621 | { |
1622 | read_lock(&sk->sk_callback_lock); | 1622 | read_lock(&sk->sk_callback_lock); |
1623 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1623 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
1624 | wake_up_interruptible(sk->sk_sleep); | 1624 | wake_up_interruptible_sync(sk->sk_sleep); |
1625 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); | 1625 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); |
1626 | read_unlock(&sk->sk_callback_lock); | 1626 | read_unlock(&sk->sk_callback_lock); |
1627 | } | 1627 | } |
@@ -1635,7 +1635,7 @@ static void sock_def_write_space(struct sock *sk) | |||
1635 | */ | 1635 | */ |
1636 | if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { | 1636 | if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { |
1637 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1637 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
1638 | wake_up_interruptible(sk->sk_sleep); | 1638 | wake_up_interruptible_sync(sk->sk_sleep); |
1639 | 1639 | ||
1640 | /* Should agree with poll, otherwise some programs break */ | 1640 | /* Should agree with poll, otherwise some programs break */ |
1641 | if (sock_writeable(sk)) | 1641 | if (sock_writeable(sk)) |