aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorMichael Wang <wangyun@linux.vnet.ibm.com>2013-07-04 00:55:51 -0400
committerIngo Molnar <mingo@kernel.org>2013-07-23 06:18:41 -0400
commit62470419e993f8d9d93db0effd3af4296ecb79a5 (patch)
tree3702386db7d904f8e5163a812f4d5352bc9b93a1 /kernel/sched
parent685207963be973fbb73550db6edaf920a283e1a7 (diff)
sched: Implement smarter wake-affine logic
The wake-affine scheduler feature is currently always trying to pull the wakee close to the waker. In theory this should be beneficial if the waker's CPU caches hot data for the wakee, and it's also beneficial in the extreme ping-pong high context switch rate case. Testing shows it can benefit hackbench up to 15%. However, the feature is somewhat blind, from which some workloads such as pgbench suffer. It's also time-consuming algorithmically. Testing shows it can damage pgbench up to 50% - far more than the benefit it brings in the best case. So wake-affine should be smarter and it should realize when to stop its thankless effort at trying to find a suitable CPU to wake on. This patch introduces 'wakee_flips', which will be increased each time the task flips (switches) its wakee target. So a high 'wakee_flips' value means the task has more than one wakee, and the bigger the number, the higher the wakeup frequency. Now when making the decision on whether to pull or not, pay attention to a wakee with a high 'wakee_flips': pulling such a task may benefit the wakee. It also implies that the waker will face cruel competition later; it could be very cruel or very fast, depending on the story behind 'wakee_flips', and the waker therefore suffers. Furthermore, if the waker also has a high 'wakee_flips', that implies that multiple tasks rely on it, so the waker's higher latency will damage all of them, and pulling the wakee seems to be a bad deal. Thus, when 'waker->wakee_flips / wakee->wakee_flips' becomes higher and higher, the cost of pulling seems to be worse and worse. The patch therefore helps the wake-affine feature to stop its pulling work when: wakee->wakee_flips > factor && waker->wakee_flips > (factor * wakee->wakee_flips) The 'factor' here is the number of CPUs in the current CPU's NUMA node, so a bigger node will lead to more pulling since the trial becomes more severe. After applying the patch, pgbench shows improvements of up to 40% and no regressions. 
Tested with 12 cpu x86 server and tip 3.10.0-rc7. The percentages in the final column highlight the areas with the biggest wins, all other areas improved as well: pgbench base smart | db_size | clients | tps | | tps | +---------+---------+-------+ +-------+ | 22 MB | 1 | 10598 | | 10796 | | 22 MB | 2 | 21257 | | 21336 | | 22 MB | 4 | 41386 | | 41622 | | 22 MB | 8 | 51253 | | 57932 | | 22 MB | 12 | 48570 | | 54000 | | 22 MB | 16 | 46748 | | 55982 | +19.75% | 22 MB | 24 | 44346 | | 55847 | +25.93% | 22 MB | 32 | 43460 | | 54614 | +25.66% | 7484 MB | 1 | 8951 | | 9193 | | 7484 MB | 2 | 19233 | | 19240 | | 7484 MB | 4 | 37239 | | 37302 | | 7484 MB | 8 | 46087 | | 50018 | | 7484 MB | 12 | 42054 | | 48763 | | 7484 MB | 16 | 40765 | | 51633 | +26.66% | 7484 MB | 24 | 37651 | | 52377 | +39.11% | 7484 MB | 32 | 37056 | | 51108 | +37.92% | 15 GB | 1 | 8845 | | 9104 | | 15 GB | 2 | 19094 | | 19162 | | 15 GB | 4 | 36979 | | 36983 | | 15 GB | 8 | 46087 | | 49977 | | 15 GB | 12 | 41901 | | 48591 | | 15 GB | 16 | 40147 | | 50651 | +26.16% | 15 GB | 24 | 37250 | | 52365 | +40.58% | 15 GB | 32 | 36470 | | 50015 | +37.14% Signed-off-by: Michael Wang <wangyun@linux.vnet.ibm.com> Cc: Mike Galbraith <efault@gmx.de> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51D50057.9000809@linux.vnet.ibm.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/fair.c47
1 files changed, 47 insertions, 0 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 765d87acdf05..860063a8c849 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3017,6 +3017,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3017 return 0; 3017 return 0;
3018} 3018}
3019 3019
3020static void record_wakee(struct task_struct *p)
3021{
3022 /*
3023 * Rough decay (wiping) for cost saving, don't worry
3024 * about the boundary, really active task won't care
3025 * about the loss.
3026 */
3027 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3028 current->wakee_flips = 0;
3029 current->wakee_flip_decay_ts = jiffies;
3030 }
3031
3032 if (current->last_wakee != p) {
3033 current->last_wakee = p;
3034 current->wakee_flips++;
3035 }
3036}
3020 3037
3021static void task_waking_fair(struct task_struct *p) 3038static void task_waking_fair(struct task_struct *p)
3022{ 3039{
@@ -3037,6 +3054,7 @@ static void task_waking_fair(struct task_struct *p)
3037#endif 3054#endif
3038 3055
3039 se->vruntime -= min_vruntime; 3056 se->vruntime -= min_vruntime;
3057 record_wakee(p);
3040} 3058}
3041 3059
3042#ifdef CONFIG_FAIR_GROUP_SCHED 3060#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3155,6 +3173,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
3155 3173
3156#endif 3174#endif
3157 3175
3176static int wake_wide(struct task_struct *p)
3177{
3178 int factor = nr_cpus_node(cpu_to_node(smp_processor_id()));
3179
3180 /*
3181 * Yeah, it's the switching-frequency, could means many wakee or
3182 * rapidly switch, use factor here will just help to automatically
3183 * adjust the loose-degree, so bigger node will lead to more pull.
3184 */
3185 if (p->wakee_flips > factor) {
3186 /*
3187 * wakee is somewhat hot, it needs certain amount of cpu
3188 * resource, so if waker is far more hot, prefer to leave
3189 * it alone.
3190 */
3191 if (current->wakee_flips > (factor * p->wakee_flips))
3192 return 1;
3193 }
3194
3195 return 0;
3196}
3197
3158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3198static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3159{ 3199{
3160 s64 this_load, load; 3200 s64 this_load, load;
@@ -3164,6 +3204,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3164 unsigned long weight; 3204 unsigned long weight;
3165 int balanced; 3205 int balanced;
3166 3206
3207 /*
3208 * If we wake multiple tasks be careful to not bounce
3209 * ourselves around too much.
3210 */
3211 if (wake_wide(p))
3212 return 0;
3213
3167 idx = sd->wake_idx; 3214 idx = sd->wake_idx;
3168 this_cpu = smp_processor_id(); 3215 this_cpu = smp_processor_id();
3169 prev_cpu = task_cpu(p); 3216 prev_cpu = task_cpu(p);