From 58b26c4c025778c09c7a1438ff185080e11b7d0a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 18:19:17 -0700 Subject: sched: Increment cache_nice_tries only on periodic lb scheduler uses cache_nice_tries as an indicator to do cache_hot and active load balance, when normal load balance fails. Currently, this value is changed on any failed load balance attempt. That ends up being not so nice to workloads that enter/exit idle often, as they do more frequent new_idle balance and that pretty soon results in cache hot tasks being pulled in. Making the cache_nice_tries ignore failed new_idle balance seems to make better sense. With that only the failed load balance in periodic load balance gets accounted and the rate of accumulation of cache_nice_tries will not depend on idle entry/exit (short running sleep-wakeup kind of tasks). This reduces movement of cache_hot tasks. schedstat diff (after-before) excerpt from a workload that has frequent and short wakeup-idle pattern (:2 in cpu col below refers to NEWIDLE idx) This snapshot was across ~400 seconds. Without this change: domainstats: domain0 cpu cnt bln fld imb gain hgain nobusyq nobusyg 0:2 306487 219575 73167 110069413 44583 19070 1172 218403 1:2 292139 194853 81421 120893383 50745 21902 1259 193594 2:2 283166 174607 91359 129699642 54931 23688 1287 173320 3:2 273998 161788 93991 132757146 57122 24351 1366 160422 4:2 289851 215692 62190 83398383 36377 13680 851 214841 5:2 316312 222146 77605 117582154 49948 20281 988 221158 6:2 297172 195596 83623 122133390 52801 21301 929 194667 7:2 283391 178078 86378 126622761 55122 22239 928 177150 8:2 297655 210359 72995 110246694 45798 19777 1125 209234 9:2 297357 202011 79363 119753474 50953 22088 1089 200922 10:2 278797 178703 83180 122514385 52969 22726 1128 177575 11:2 272661 167669 86978 127342327 55857 24342 1195 166474 12:2 293039 204031 73211 110282059 47285 19651 948 203083 13:2 289502 196762 76803 114712942 49339 20547 1016 195746 14:2 264446 169609 78292 115715605 50459 21017 982 168627 15:2 260968 163660 80142 116811793 51483 21281 1064 162596 With this change: domainstats: domain0 cpu cnt bln fld imb gain hgain nobusyq nobusyg 0:2 272347 187380 77455 105420270 24975 1 953 186427 1:2 267276 172360 86234 116242264 28087 6 1028 171332 2:2 259769 156777 93281 123243134 30555 1 1043 155734 3:2 250870 143129 97627 127370868 32026 6 1188 141941 4:2 248422 177116 64096 78261112 22202 2 757 176359 5:2 275595 180683 84950 116075022 29400 6 778 179905 6:2 262418 162609 88944 119256898 31056 4 817 161792 7:2 252204 147946 92646 122388300 32879 4 824 147122 8:2 262335 172239 81631 110477214 26599 4 864 171375 9:2 261563 164775 88016 117203621 28331 3 849 163926 10:2 243389 140949 93379 121353071 29585 2 909 140040 11:2 242795 134651 98310 124768957 30895 2 1016 133635 12:2 255234 166622 79843 104696912 26483 4 746 165876 13:2 244944 151595 83855 109808099 27787 3 801 150794 14:2 241301 140982 89935 116954383 30403 6 845 140137 15:2 232271 128564 92821 119185207 31207 4 1416 127148 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1284167957-3675-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a171138a9402..aa16cf1eb8fe 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3031,7 +3031,14 @@ redo: if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), this_cpu)) { -- cgit v1.2.2 From b0a0f667a349247bd7f05f806b662a25653822bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Oct 2010 17:32:51 -0700 Subject: sched: suppress RCU lockdep splat in task_fork_fair > =================================================== > [ INFO: suspicious rcu_dereference_check() usage. ] > --------------------------------------------------- > /home/greearb/git/linux.wireless-testing/kernel/sched.c:618 invoked rcu_dereference_check() without protection! > > other info that might help us debug this: > > rcu_scheduler_active = 1, debug_locks = 1 > 1 lock held by ifup/23517: > #0: (&rq->lock){-.-.-.}, at: [] task_fork_fair+0x3b/0x108 > > stack backtrace: > Pid: 23517, comm: ifup Not tainted 2.6.36-rc6-wl+ #5 > Call Trace: > [] ? printk+0xf/0x16 > [] lockdep_rcu_dereference+0x74/0x7d > [] task_group+0x6d/0x79 > [] set_task_rq+0xe/0x57 > [] task_fork_fair+0x57/0x108 > [] sched_fork+0x82/0xf9 > [] copy_process+0x569/0xe8e > [] do_fork+0x118/0x262 > [] ? do_page_fault+0x16a/0x2cf > [] ? up_read+0x16/0x2a > [] sys_clone+0x1b/0x20 > [] ptregs_clone+0x15/0x30 > [] ? sysenter_do_call+0x12/0x38 Here a newly created task is having its runqueue assigned. The new task is not yet on the tasklist, so cannot go away. This is therefore a false positive, suppress with an RCU read-side critical section. Reported-by: Ben Greear Tested-by: Ben Greear Date: Thu, 14 Oct 2010 16:09:13 +0900 Subject: sched: Comment updates: fix default latency and granularity numbers Targeted preemption latency and minimal preemption granularity for CPU-bound tasks have been changed. This patch updates the comments about these values. Signed-off-by: Takuya Yoshikawa Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20101014160913.eb24fef4.yoshikawa.takuya@oss.ntt.co.jp> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 623e9aceef8f..bf87192e97fe 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,7 +25,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -- cgit v1.2.2 From 2582f0eba54066b5e98ff2b27ef0cfa833b59f54 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 13 Oct 2010 12:09:36 -0700 Subject: sched: Set group_imb only a task can be pulled from the busiest cpu When cycling through sched groups to determine the busiest group, set group_imb only if the busiest cpu has more than 1 runnable task. This patch fixes the case where two cpus in a group have one runnable task each, but there is a large weight differential between these two tasks. The load balancer is unable to migrate any task from this group, and hence do not consider this group to be imbalanced. Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1286996978-7007-3-git-send-email-ncrao@google.com> [ small code readability edits ] Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bf87192e97fe..3656480e0f79 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2378,7 +2378,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load; + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; @@ -2389,6 +2389,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; + max_nr_running = 0; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); @@ -2406,8 +2407,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) + if (load > max_cpu_load) { max_cpu_load = load; + max_nr_running = rq->nr_running; + } if (min_cpu_load > load) min_cpu_load = load; } @@ -2447,11 +2450,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); } -- cgit v1.2.2 From fab476228ba37907ad75216d0fd9732ada9c119e Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Fri, 15 Oct 2010 13:12:29 -0700 Subject: sched: Force balancing on newidle balance if local group has capacity This patch forces a load balance on a newly idle cpu when the local group has extra capacity and the busiest group does not have any. It improves system utilization when balancing tasks with a large weight differential. Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. With this patch, if the local group has extra capacity, we shortcut the checks in f_b_g() and try to pull a task over. A sched group has extra capacity if the group capacity is greater than the number of running tasks in that group. Thanks to Mike Galbraith for discussions leading to this patch and for the insight to reuse SD_NEWIDLE_BALANCE. Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-4-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3656480e0f79..032b548be0fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); + + /* re-arm NEWIDLE balancing when moving tasks */ + src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; + this_rq->idle_stamp = 0; } /* @@ -2030,12 +2034,14 @@ struct sd_lb_stats { unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; + unsigned long this_has_capacity; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2058,6 +2064,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? */ }; /** @@ -2456,6 +2463,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; } /** @@ -2554,12 +2564,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; sds->group_imb = sgs.group_imb; } @@ -2756,6 +2768,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } + /******* find_busiest_group() helpers end here *********************/ /** @@ -2807,6 +2820,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. + * + * Note: when doing newidle balance, if the local group has excess + * capacity (i.e. nr_running < group_capacity) and the busiest group + * does not have any capacity, we force a load balance to pull tasks + * to the local group. In this case, we skip past checks 3, 4 and 5. */ if (!(*balance)) goto ret; @@ -2818,6 +2836,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + if (sds.this_load >= sds.max_load) goto out_balanced; @@ -2829,6 +2852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; +force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -3162,10 +3186,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } raw_spin_lock(&this_rq->lock); -- cgit v1.2.2 From 75dd321d79d495a0ee579e6249ebc38ddbb2667f Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Fri, 15 Oct 2010 13:12:30 -0700 Subject: sched: Drop group_capacity to 1 only if local group has extra capacity When SD_PREFER_SIBLING is set on a sched domain, drop group_capacity to 1 only if the local group has extra capacity. The extra check prevents the case where you always pull from the heaviest group when it is already under-utilized (possible with a large weight task outweighs the tasks on the system). For example, consider a 16-cpu quad-core quad-socket machine with MC and NUMA scheduling domains. Let's say we spawn 15 nice0 tasks and one nice-15 task, and each task is running on one core. In this case, we observe the following events when balancing at the NUMA domain: - find_busiest_group() will always pick the sched group containing the niced task to be the busiest group. - find_busiest_queue() will then always pick one of the cpus running the nice0 task (never picks the cpu with the nice -15 task since weighted_cpuload > imbalance). - The load balancer fails to migrate the task since it is the running task and increments sd->nr_balance_failed. - It repeats the above steps a few more times until sd->nr_balance_failed > 5, at which point it kicks off the active load balancer, wakes up the migration thread and kicks the nice 0 task off the cpu. The load balancer doesn't stop until we kick out all nice 0 tasks from the sched group, leaving you with 3 idle cpus and one cpu running the nice -15 task. When balancing at the NUMA domain, we drop sgs.group_capacity to 1 if the child domain (in this case MC) has SD_PREFER_SIBLING set. Subsequent load checks are not relevant because the niced task has a very large weight. In this patch, we add an extra condition to the "if(prefer_sibling)" check in update_sd_lb_stats(). We drop the capacity of a group only if the local group has extra capacity, ie. nr_running < group_capacity. This patch preserves the original intent of the prefer_siblings check (to spread tasks across the system in low utilization scenarios) and fixes the case above. It helps in the following ways: - In low utilization cases (where nr_tasks << nr_cpus), we still drop group_capacity down to 1 if we prefer siblings. - On very busy systems (where nr_tasks >> nr_cpus), sgs.nr_running will most likely be > sgs.group_capacity. - When balancing large weight tasks, if the local group does not have extra capacity, we do not pick the group with the niced task as the busiest group. This prevents failed balances, active migration and the under-utilization described above. Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-5-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 032b548be0fc..f1c615ff39d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2554,9 +2554,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try - * and move all the excess tasks away. + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling) + if (prefer_sibling && !local_group && sds->this_has_capacity) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { -- cgit v1.2.2 From 305e6835e05513406fa12820e40e4a8ecb63743c Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:21 -0700 Subject: sched: Do not account irq time to current task Scheduler accounts both softirq and interrupt processing times to the currently running task. This means, if the interrupt processing was for some other task in the system, then the current task ends up being penalized as it gets shorter runtime than otherwise. Change sched task accounting to acoount only actual task time from currently running task. Now update_curr(), modifies the delta_exec to depend on rq->clock_task. Note that this change only handles CONFIG_IRQ_TIME_ACCOUNTING case. We can extend this to CONFIG_VIRT_CPU_ACCOUNTING with minimal effort. But, thats for later. This change will impact scheduling behavior in interrupt heavy conditions. Tested on a 4-way system with eth0 handled by CPU 2 and a network heavy task (nc) running on CPU 3 (and no RSS/RFS). With that I have CPU 2 spending 75%+ of its time in irq processing. CPU 3 spending around 35% time running nc task. Now, if I run another CPU intensive task on CPU 2, without this change /proc//schedstat shows 100% of time accounted to this task. With this change, it rightly shows less than 25% accounted to this task as remaining time is actually spent on irq processing. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-7-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f1c615ff39d6..c358d4081b81 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock; + se->exec_start = rq_of(cfs_rq)->clock_task; } /************************************************** @@ -1802,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock, sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.2 From aa483808516ca5cacfa0e5849691f64fec25828e Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:22 -0700 Subject: sched: Remove irq time from available CPU power The idea was suggested by Peter Zijlstra here: http://marc.info/?l=linux-kernel&m=127476934517534&w=2 irq time is technically not available to the tasks running on the CPU. This patch removes irq time from CPU power piggybacking on sched_rt_avg_update(). Tested this by keeping CPU X busy with a network intensive task having 75% oa a single CPU irq processing (hard+soft) on a 4-way system. And start seven cycle soakers on the system. Without this change, there will be two tasks on each CPU. With this change, there is a single task on irq busy CPU X and remaining 7 tasks are spread around among other 3 CPUs. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-8-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c358d4081b81..74cccfae87a8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2275,7 +2275,13 @@ unsigned long scale_rt_power(int cpu) u64 total, available; total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; -- cgit v1.2.2 From b2b5ce022acf5e9f52f7b78c5579994fdde191d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Oct 2010 15:24:15 +0200 Subject: sched, cgroup: Fixup broken cgroup movement Dima noticed that we fail to correct the ->vruntime of sleeping tasks when we move them between cgroups. Reported-by: Dima Zavin Signed-off-by: Peter Zijlstra Tested-by: Mike Galbraith LKML-Reference: <1287150604.29097.1513.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 74cccfae87a8..3acc2a487c18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3866,13 +3866,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3924,7 +3937,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; -- cgit v1.2.2 From aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 17 Sep 2010 15:02:32 -0700 Subject: sched: Use group weight, idle cpu metrics to fix imbalances during idle Currently we consider a sched domain to be well balanced when the imbalance is less than the domain's imablance_pct. As the number of cores and threads are increasing, current values of imbalance_pct (for example 25% for a NUMA domain) are not enough to detect imbalances like: a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads), 24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another socket. Leading to an idle HT cpu. b) On a hypothetial 2 socket NHM-EX system (each socket having 8 cores and 16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one socket and 7 on another socket. Leaving one core in a socket idle whereas in another socket we have a core having both its HT siblings busy. While this issue can be fixed by decreasing the domain's imbalance_pct (by making it a function of number of logical cpus in the domain), it can potentially cause more task migrations across sched groups in an overloaded case. Fix this by using imbalance_pct only during newly_idle and busy load balancing. And during idle load balancing, check if there is an imbalance in number of idle cpu's across the busiest and this sched_group or if the busiest group has more tasks than its weight that the idle cpu in this_group can pull. Reported-by: Nikhil Rao Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..034c4f410b36 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2035,13 +2035,16 @@ struct sd_lb_stats { unsigned long this_load_per_task; unsigned long this_nr_running; unsigned long this_has_capacity; + unsigned int this_idle_cpus; /* Statistics of the busiest group */ + unsigned int busiest_idle_cpus; unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; unsigned long busiest_has_capacity; + unsigned int busiest_group_weight; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2063,6 +2066,8 @@ struct sg_lb_stats { unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; + unsigned long idle_cpus; + unsigned long group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ }; @@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; sgs->sum_weighted_load += weighted_cpuload(i); - + if (idle_cpu(i)) + sgs->idle_cpus++; } /* @@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; @@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; sds->this_has_capacity = sgs.group_has_capacity; + sds->this_idle_cpus = sgs.idle_cpus; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_idle_cpus = sgs.idle_cpus; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->busiest_has_capacity = sgs.group_has_capacity; + sds->busiest_group_weight = sgs.group_weight; sds->group_imb = sgs.group_imb; } @@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; + /* + * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. + * And to check for busy balance use !idle_cpu instead of + * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE + * even when they are idle. + */ + if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; + } else { + /* + * This cpu is idle. If the busiest group load doesn't + * have more tasks than the number of available cpu's and + * there is no imbalance between this and busiest group + * wrt to idle cpu's, it is balanced. + */ + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + sds.busiest_nr_running <= sds.busiest_group_weight) + goto out_balanced; + } force_balance: /* Looks like there is an imbalance. Compute it */ -- cgit v1.2.2 From 1e5a74059f9059d330744eac84873b1b99657008 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 31 Oct 2010 12:37:04 +0100 Subject: sched: Fix cross-sched-class wakeup preemption Instead of dealing with sched classes inside each check_preempt_curr() implementation, pull out this logic into the generic wakeup preemption path. This fixes a hang in KVM (and others) where we are waiting for the stop machine thread to run ... Reported-by: Markus Trippelsdorf Tested-by: Marcelo Tosatti Tested-by: Sergey Senozhatsky Signed-off-by: Peter Zijlstra LKML-Reference: <1288891946.2039.31.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 034c4f410b36..52ab113d8bb9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) - goto preempt; - - if (unlikely(p->sched_class != &fair_sched_class)) - return; - if (unlikely(se == pse)) return; -- cgit v1.2.2 From b5482cfa1c95a188b3054fa33274806add91bbe5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 16 Nov 2010 17:34:02 +0800 Subject: sched: Fix volanomark performance regression Commit fab4762 triggers excessive idle balancing, causing a ~30% loss in volanomark throughput. Remove idle balancing throttle reset. Originally-by: Alex Shi Signed-off-by: Mike Galbraith Acked-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1289928732.5169.211.camel@maggy.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 52ab113d8bb9..ba0556dc7c06 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1758,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); - - /* re-arm NEWIDLE balancing when moving tasks */ - src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; - this_rq->idle_stamp = 0; } /* -- cgit v1.2.2 From d5ad140bc1505a98c0f040937125bfcbb508078f Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 17 Nov 2010 11:42:04 -0800 Subject: sched: Fix idle balancing An earlier commit reverts idle balancing throttling reset to fix a 30% regression in volanomark throughput. We still need to reset idle_stamp when we pull a task in newidle balance. Reported-by: Alex Shi Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1290022924-3548-1-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba0556dc7c06..00ebd7686676 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3215,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } raw_spin_lock(&this_rq->lock); -- cgit v1.2.2 From 2069dd75c7d0f49355939e5586daf5a9ab216db7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:00 -0800 Subject: sched: Rewrite tg_shares_up) By tracking a per-cpu load-avg for each cfs_rq and folding it into a global task_group load on each tick we can rework tg_shares_up to be strictly per-cpu. This should improve cpu-cgroup performance for smp systems significantly. [ Paul: changed to use queueing cfs_rq + bug fixes ] Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.580480400@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 164 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 107 insertions(+), 57 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..d86544b4151c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_load(struct cfs_rq *cfs_rq) +{ + u64 period = sched_avg_period(); + u64 now, delta; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + cfs_rq->load_stamp = now; + cfs_rq->load_period += delta; + cfs_rq->load_avg += delta * cfs_rq->load.weight; + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) + account_entity_dequeue(cfs_rq, se); + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_load(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3156,8 +3206,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; -- cgit v1.2.2 From 3d4b47b4b040c9d77dd68104cfc1055d89a55afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:01 -0800 Subject: sched: Implement on-demand (active) cfs_rq list Make certain load-balance actions scale per number of active cgroups instead of the number of existing cgroups. This makes wakeup/sleep paths more expensive, but is a win for systems where the vast majority of existing cgroups are idle. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.666535048@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d86544b4151c..0560e72bd732 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -143,6 +143,24 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +264,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -648,7 +674,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { u64 period = sched_avg_period(); u64 now, delta; @@ -673,6 +699,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) cfs_rq->load_period /= 2; cfs_rq->load_avg /= 2; } + + if (lb && !cfs_rq->nr_running) { + if (cfs_rq->load_avg < (period / 8)) + list_del_leaf_cfs_rq(cfs_rq); + } } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -719,7 +750,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq) +static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } @@ -849,7 +880,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -863,6 +894,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -907,7 +941,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); @@ -1142,7 +1176,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } @@ -1172,7 +1206,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } -- cgit v1.2.2 From 9e3081ca61147b29f52fddb4f7c6b6b82ea5eb7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:02 -0800 Subject: sched: Make tg_shares_up() walk on-demand Make tg_shares_up() use the active cgroup list, this means we cannot do a strict bottom-up walk of the hierarchy, but assuming its a very wide tree with a small number of active groups it should be a win. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.754159484@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0560e72bd732..46ff6587dc16 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2004,6 +2004,60 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int tg_shares_up(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + long load_avg; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct task_group *tg = cfs_rq->tg; + + do { + tg_shares_up(tg, cpu); + tg = tg->parent; + } while (tg); + } + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2051,6 +2105,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v1.2.2 From f0d7442a5924a802b66eef79b3708f77297bfb35 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:03 -0800 Subject: sched: Fix load corruption from update_cfs_shares() As part of enqueue_entity both a new entity weight and its contribution to the queuing cfs_rq / rq are updated. Since update_cfs_shares will only update the queueing weights when the entity is on_rq (which in this case it is not yet), there's a dependency loop here: update_cfs_shares needs account_entity_enqueue to update cfs_rq->load.weight account_entity_enqueue needs the updated weight for the queuing cfs_rq load[*] Fix this and avoid spurious dequeue/enqueues by issuing update_cfs_shares as if we had accounted the enqueue already. This was also resulting in rq->load corruption previously. [*]: this dependency also exists when using the group cfs_rq w/ update_cfs_shares as the weight of the enqueued entity changes without the load being updated. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.844900206@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 46ff6587dc16..d52b97a04e7a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -718,7 +718,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; @@ -732,7 +732,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) if (!se) return; - load = cfs_rq->load.weight; + load = cfs_rq->load.weight + weight_delta; load_weight = atomic_read(&tg->load_weight); load_weight -= cfs_rq->load_contribution; @@ -754,7 +754,7 @@ static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -881,8 +881,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -944,7 +944,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -1177,7 +1177,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -1207,7 +1207,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -2034,7 +2034,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.2 From e33078baa4d30ad1d0e46d1f62b9e5a63a3e6ee3 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:04 -0800 Subject: sched: Fix update_cfs_load() synchronization Using cfs_rq->nr_running is not sufficient to synchronize update_cfs_load with the put path since nr_running accounting occurs at deactivation. It's also not safe to make the removal decision based on load_avg as this fails with both high periods and low shares. Resolve this by clipping history after 4 periods without activity. Note: the above will always occur from update_shares() since in the last-task-sleep-case that task will still be cfs_rq->curr when update_cfs_load is called. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.933428187@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d52b97a04e7a..a543a5b202a4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -674,10 +674,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static void update_cfs_load(struct cfs_rq *cfs_rq) { u64 period = sched_avg_period(); u64 now, delta; + unsigned long load = cfs_rq->load.weight; if (!cfs_rq) return; @@ -685,9 +686,19 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) now = rq_of(cfs_rq)->clock; delta = now - cfs_rq->load_stamp; + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + cfs_rq->load_stamp = now; cfs_rq->load_period += delta; - cfs_rq->load_avg += delta * cfs_rq->load.weight; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } while (cfs_rq->load_period > period) { /* @@ -700,10 +711,8 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) cfs_rq->load_avg /= 2; } - if (lb && !cfs_rq->nr_running) { - if (cfs_rq->load_avg < (period / 8)) - list_del_leaf_cfs_rq(cfs_rq); - } + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -750,7 +759,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static inline void update_cfs_load(struct cfs_rq *cfs_rq) { } @@ -880,7 +889,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); @@ -941,7 +950,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq, 0); @@ -1176,7 +1185,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -1206,7 +1215,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -2023,7 +2032,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + update_cfs_load(cfs_rq); load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); load_avg -= cfs_rq->load_contribution; -- cgit v1.2.2 From 67e86250f8ea7b8f7da53ac25ea73c6bd71f5cd9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:05 -0800 Subject: sched: Introduce hierarchal order on shares update list Avoid duplicate shares update calls by ensuring children always appear before parents in rq->leaf_cfs_rq_list. This allows us to do a single in-order traversal for update_shares(). Since we always enqueue in bottom-up order this reduces to 2 cases: 1) Our parent is already in the list, e.g. root \ b /\ c d* (root->b->c already enqueued) Since d's parent is enqueued we push it to the head of the list, implicitly ahead of b. 2) Our parent does not appear in the list (or we have no parent) In this case we enqueue to the tail of the list, if our parent is subsequently enqueued (bottom-up) it will appear to our right by the same rule. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.022488865@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a543a5b202a4..b320753aa6c9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -146,8 +146,20 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, &rq_of(cfs_rq)->leaf_cfs_rq_list); + } cfs_rq->on_list = 1; } @@ -2016,7 +2028,7 @@ out: /* * update tg->load_weight by folding this cpu's load_avg */ -static int tg_shares_up(struct task_group *tg, int cpu) +static int update_shares_cpu(struct task_group *tg, int cpu) { struct cfs_rq *cfs_rq; unsigned long flags; @@ -2056,14 +2068,8 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct task_group *tg = cfs_rq->tg; - - do { - tg_shares_up(tg, cpu); - tg = tg->parent; - } while (tg); - } + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } -- cgit v1.2.2 From a7a4f8a752ec734b2eab904fc863d5dc873de338 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:06 -0800 Subject: sched: Add sysctl_sched_shares_window Introduce a new sysctl for the shares window and disambiguate it from sched_time_avg. A 10ms window appears to be a good compromise between accuracy and performance. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.112173964@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b320753aa6c9..6c84439ce987 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * The exponential sliding window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + static const struct sched_class fair_sched_class; /************************************************************** @@ -688,7 +695,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED static void update_cfs_load(struct cfs_rq *cfs_rq) { - u64 period = sched_avg_period(); + u64 period = sysctl_sched_shares_window; u64 now, delta; unsigned long load = cfs_rq->load.weight; -- cgit v1.2.2 From c66eaf619c0c7937e9ded160ae83b5a7a6b19b56 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:07 -0800 Subject: sched: Update shares on idle_balance Since shares updates are no longer expensive and effectively local, update them at idle_balance(). This allows us to more quickly redistribute shares to another cpu when our load becomes idle. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.204191702@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c84439ce987..33f941dcf88c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3343,6 +3343,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + update_shares(this_cpu); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; -- cgit v1.2.2 From 3b3d190ec3683d568fd2ebaead5e1ec7f97b6e37 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:08 -0800 Subject: sched: Implement demand based update_cfs_load() When the system is busy, dilation of rq->next_balance makes lb->update_shares() insufficiently frequent for threads which don't sleep (no dequeue/enqueue updates). Adjust for this by making demand based updates based on the accumulation of execution time sufficient to wrap our averaging window. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.291159744@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 33f941dcf88c..e7e2f08e6d01 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -539,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -558,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq, 0); + } +#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -713,6 +724,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) } cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; cfs_rq->load_period += delta; if (load) { cfs_rq->load_last = now; -- cgit v1.2.2 From d6b5591829bd348a5fbe1c428d28dea00621cdba Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:09 -0800 Subject: sched: Allow update_cfs_load() to update global load Refactor the global load updates from update_shares_cpu() so that update_cfs_load() can update global load when it is more than ~10% out of sync. The new global_load parameter allows us to force an update, regardless of the error factor so that we can synchronize w/ update_shares(). Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234938.377473595@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e7e2f08e6d01..390ce30ff2d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -539,7 +539,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); /* @@ -565,7 +565,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } #endif @@ -704,7 +704,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { u64 period = sysctl_sched_shares_window; u64 now, delta; @@ -731,6 +746,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) cfs_rq->load_avg += delta * load; } + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + while (cfs_rq->load_period > period) { /* * Inline assembly required to prevent the compiler @@ -790,7 +810,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } @@ -920,7 +940,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); @@ -981,7 +1001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq, 0); @@ -1216,7 +1236,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } @@ -1246,7 +1266,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); } @@ -2052,7 +2072,6 @@ static int update_shares_cpu(struct task_group *tg, int cpu) struct cfs_rq *cfs_rq; unsigned long flags; struct rq *rq; - long load_avg; if (!tg->se[cpu]) return 0; @@ -2063,12 +2082,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq); - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; + update_cfs_load(cfs_rq, 1); /* * We need to update shares after updating tg->load_weight in -- cgit v1.2.2 From 70caf8a6c13c2279b35f2ad6b644815533d6c476 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 20 Nov 2010 00:53:51 +0100 Subject: sched: Fix UP build breakage The recent cgroup-scheduling rework caused a UP build problem. Cc: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 390ce30ff2d0..82fd884b4e33 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -562,7 +562,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); -- cgit v1.2.2 From 43365bd7ff37979d2afdccbe953299ed64a4649b Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 15 Dec 2010 19:10:17 -0800 Subject: sched: Move periodic share updates to entity_tick() Long running entities that do not block (dequeue) require periodic updates to maintain accurate share values. (Note: group entities with several threads are quite likely to be non-blocking in many circumstances). By virtue of being long-running however, we will see entity ticks (otherwise the required update occurs in dequeue/put and we are done). Thus we can move the detection (and associated work) for these updates into the periodic path. This restores the 'atomicity' of update_curr() with respect to accounting. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101216031038.067028969@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c88671718bc9..16ee398d8a4e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -564,10 +564,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } #endif } @@ -809,6 +805,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { @@ -817,6 +821,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { } + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} #endif /* CONFIG_FAIR_GROUP_SCHED */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1133,6 +1141,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother -- cgit v1.2.2 From 19e5eebb8eaa5ca3ff8aa18cb57ccb7a9f67277d Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 15 Dec 2010 19:10:18 -0800 Subject: sched: Fix interactivity bug by charging unaccounted run-time on entity re-weight Mike Galbraith reported poor interactivity[*] when the new shares distribution code was combined with autogroups. The root cause turns out to be a mis-ordering of accounting accrued execution time and shares updates. Since update_curr() is issued hierarchically, updating the parent entity weights to reflect child enqueue/dequeue results in the parent's unaccounted execution time then being accrued (vs vruntime) at the new weight as opposed to the weight present at accumulation. While this doesn't have much effect on processes with timeslices that cross a tick, it is particularly problematic for an interactive process (e.g. Xorg) which incurs many (tiny) timeslices. In this scenario almost all updates are at dequeue which can result in significant fairness perturbation (especially if it is the only thread, resulting in potential {tg->shares, MIN_SHARES} transitions). Correct this by ensuring unaccounted time is accumulated prior to manipulating an entity's weight. [*] http://xkcd.com/619/ is perversely Nostradamian here. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds LKML-Reference: <20101216031038.159704378@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 16ee398d8a4e..c62ebae65cf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -765,8 +765,12 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - if (se->on_rq) + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); + } update_load_set(&se->load, weight); -- cgit v1.2.2 From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 14 Jan 2011 17:57:50 -0800 Subject: sched: Update effective_load() to use global share weights Previously effective_load would approximate the global load weight present on a group taking advantage of: entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided by tg_shares_up. This worked (approximately) for an 'empty' (at tg level) cpu since we would place boost load representative of what a newly woken task would receive. However, now that load is instantaneously updated this assumption is no longer true and the load calculation is rather incorrect in this case. Fix this (and improve the general case) by re-writing effective_load to take advantage of the new shares distribution code. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110115015817.069769529@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf0..414145cf5344 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } -- cgit v1.2.2 From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 5 Jan 2011 05:41:17 +0100 Subject: sched: Fix signed unsigned comparison in check_preempt_tick() Signed unsigned comparison may lead to superfluous resched if leftmost is right of the current task, wasting a few cycles, and inadvertently _lengthening_ the current task's slice. Reported-by: Venkatesh Pallipadi Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294202477.9384.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 414145cf5344..77e9166d7bbf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } -- cgit v1.2.2 From 3ff6dcac735704824c1dff64dc6863c390d364cc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 24 Jan 2011 15:33:52 +0800 Subject: sched: Fix poor interactivity on UP systems due to group scheduler nice tune bug Michael Witten and Christian Kujau reported that the autogroup scheduling feature hurts interactivity on their UP systems. It turns out that this is an older bug in the group scheduling code, and the wider appeal provided by the autogroup feature exposed it more prominently. When on UP with FAIR_GROUP_SCHED enabled, tune shares only affect tg->shares, but is not reflected in tg->se->load. The reason is that update_cfs_shares() does nothing on UP. So introduce update_cfs_shares() for UP && FAIR_GROUP_SCHED. This issue was found when enable autogroup scheduling was enabled, but it is an older bug that also exists on cgroup.cpu on UP. Reported-and-Tested-by: Michael Witten Reported-and-Tested-by: Christian Kujau Signed-off-by: Yong Zhang Acked-by: Pekka Enberg Acked-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: <20110124073352.GA24186@windriver.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 78 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 77e9166d7bbf..354769979c02 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) { @@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; - long load_weight, load, shares; + long shares; if (!cfs_rq) return; @@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) return; - - load = cfs_rq->load.weight + weight_delta; - - load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; - load_weight += load; - - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg, weight_delta); reweight_entity(cfs_rq_of(se), se, shares); } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } -} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { -- cgit v1.2.2 From e37b6a7b27b400c3aa488db8c6629a05095bc79c Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:44:59 -0800 Subject: sched: Fix sign under-flows in wake_affine While care is taken around the zero-point in effective_load to not exceed the instantaneous rq->weight, it's still possible (e.g. using wake_idx != 0) for (load + effective_load) to underflow. In this case the comparing the unsigned values can result in incorrect balanced decisions. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.734245014@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 354769979c02..ccecfec02a70 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1432,7 +1432,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - unsigned long this_load, load; + s64 this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; struct task_group *tg; @@ -1471,8 +1471,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load) { - unsigned long this_eff_load, prev_eff_load; + if (this_load > 0) { + s64 this_eff_load, prev_eff_load; this_eff_load = 100; this_eff_load *= power_of(prev_cpu); -- cgit v1.2.2 From b815f1963e47b9b69bb17e0588bd5af5b1114ae0 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:00 -0800 Subject: sched: Fix/remove redundant cfs_rq checks Since updates are against an entity's queuing cfs_rq it's not possible to enter update_cfs_{shares,load} with a NULL cfs_rq. (Indeed, update_cfs_load would crash prior to the check if we did anyway since we load is examined during the initializers). Also, in the update_cfs_load case there's no point in maintaining averages for rq->cfs_rq since we don't perform shares distribution at that level -- NULL check is replaced accordingly. Thanks to Dan Carpenter for pointing out the deference before NULL check. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.825284940@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ccecfec02a70..1997383ba4d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -722,7 +722,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (!cfs_rq) + if (cfs_rq->tg == &root_task_group) return; now = rq_of(cfs_rq)->clock; @@ -830,9 +830,6 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) struct sched_entity *se; long shares; - if (!cfs_rq) - return; - tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) -- cgit v1.2.2 From 05ca62c6ca17f39b88fa956d5ebc1fa6e93ad5e3 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:02 -0800 Subject: sched: Use rq->clock_task instead of rq->clock for correctly maintaining load averages The delta in clock_task is a more fair attribution of how much time a tg has been contributing load to the current cpu. While not really important it also means we're more in sync (by magnitude) with respect to periodic updates (since __update_curr deltas are clock_task based). Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044852.007092349@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1997383ba4d6..0c26e2df450e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -725,7 +725,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) if (cfs_rq->tg == &root_task_group) return; - now = rq_of(cfs_rq)->clock; + now = rq_of(cfs_rq)->clock_task; delta = now - cfs_rq->load_stamp; /* truncate load history at 4 idle periods */ -- cgit v1.2.2 From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:01 -0800 Subject: sched: Simplify update_cfs_shares parameters Re-visiting this: Since update_cfs_shares will now only ever re-weight an entity that is a relative parent of the current entity in enqueue_entity; we can safely issue the account_entity_enqueue relative to that cfs_rq and avoid the requirement for special handling of the enqueue case in update_cfs_shares. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.915214637@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..0c550c841eee 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. Skip current tasks that @@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.2 From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:03 -0800 Subject: sched: Avoid expensive initial update_cfs_load() Since cfs->{load_stamp,load_last} are zero-initalized the initial load update will consider the delta to be 'since the beginning of time'. This results in a lot of pointless divisions to bring this large period to be within the sysctl_sched_shares_window. Fix this by initializing load_stamp to be 1 at cfs_rq initialization, this allows for an initial load_stamp > load_last which then lets standard idle truncation proceed. We avoid spinning (and slightly improve consistency) by fixing delta to be [period - 1] in this path resulting in a slightly more predictable shares ramp. (Previously the amount of idle time preserved by the overflow would range between [period/2,period-1].) Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044852.102126037@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c550c841eee..4cbc9121094c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) now - cfs_rq->load_last > 4 * period) { cfs_rq->load_period = 0; cfs_rq->load_avg = 0; + delta = period - 1; } cfs_rq->load_stamp = now; -- cgit v1.2.2 From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Jan 2011 17:03:27 +0100 Subject: sched: Fix switch_from_fair() When a task is taken out of the fair class we must ensure the vruntime is properly normalized because when we put it back in it will assume to be normalized. The case that goes wrong is when changing away from the fair class while sleeping. Sleeping tasks have non-normalized vruntime in order to make sleeper-fairness work. So treat the switch away from fair as a wakeup and preserve the relative vruntime. Also update sysrq-n to call the ->switch_{to,from} methods. Reported-by: Onkalo Samu Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4cbc9121094c..55040f3938d8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, -- cgit v1.2.2 From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:47:15 -0500 Subject: sched: Check the right ->nr_running in yield_task_fair() With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq. Yielding to a task from another cfs_rq may be worthwhile, since a process calling yield typically cannot use the CPU right now. Therefor, we want to check the per-cpu nr_running, not the cgroup local one. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 55040f3938d8..4de9905228c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ - if (unlikely(cfs_rq->nr_running == 1)) + if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); -- cgit v1.2.2 From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:48:37 -0500 Subject: sched: Limit the scope of clear_buddies The clear_buddies function does not seem to play well with the concept of hierarchical runqueues. In the following tree, task groups are represented by 'G', tasks by 'T', next by 'n' and last by 'l'. (nl) / \ G(nl) G / \ \ T(l) T(n) T This situation can arise when a task is woken up T(n), and the previously running task T(l) is marked last. When clear_buddies is called from either T(l) or T(n), the next and last buddies of the group G(nl) will be cleared. This is not the desired result, since we would like to be able to find the other type of buddy in many cases. This especially a worry when implementing yield_task_fair through the buddy system. The fix is simple: only clear the buddy type that the task itself is indicated to be. As an added bonus, we stop walking up the tree when the buddy has already been cleared or pointed elsewhere. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4de9905228c4..a785e08202cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); } static void -- cgit v1.2.2 From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:03 -0500 Subject: sched: Use a buddy to implement yield_task_fair() Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processen in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 148 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 88 insertions(+), 60 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a785e08202cf..c0fbeb992833 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - /* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se) } } +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->next == se) __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: -- cgit v1.2.2 From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 1 Feb 2011 09:50:51 -0500 Subject: sched: Add yield_to(task, preempt) functionality Currently only implemented for fair class tasks. Add a yield_to_task method() to the fair scheduling class. allowing the caller of yield_to() to accelerate another thread in it's thread group, task group. Implemented via a scheduler hint, using cfs_rq->next to encourage the target being selected. We can rely on pick_next_entity to keep things fair, so noone can accelerate a thread that has already used its fair share of CPU time. This also means callers should only call yield_to when they really mean it. Calling it too often can result in the scheduler just ignoring the hint. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c0fbeb992833..027024694043 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ + if (preempt) + resched_task(rq->curr); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, -- cgit v1.2.2 From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 14 Feb 2011 14:38:50 -0800 Subject: sched: Wholesale removal of sd_idle logic sd_idle logic was introduced way back in 2005 (commit 5969fe06), as an HT optimization. As per the discussion in the thread here: lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1 https://patchwork.kernel.org/patch/532501/ The capacity based logic in the load balancer right now handles this in a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle does not seem to bring any additional benefits. sd_idle logic also has some bugs that has performance impact. Here is the patch that removes the sd_idle logic altogether. Also, there was a dependency of sched_mc_power_savings == 2, with sd_idle logic. Signed-off-by: Venkatesh Pallipadi Acked-by: Vaidyanathan Srinivasan Signed-off-by: Peter Zijlstra LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 53 +++++++++++------------------------------------------ 1 file changed, 11 insertions(+), 42 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 027024694043..d384e739ea95 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. @@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); /* Cases where imbalance does not exist from POV of this_cpu */ /* 1) this_cpu is not the appropriate cpu to perform load balancing @@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3392,8 +3370,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3448,10 +3425,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3465,11 +3438,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } -- cgit v1.2.2 From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:52:53 +0100 Subject: sched: Clean up remnants of sd_idle With the wholesale removal of the sd_idle SMT logic we can clean up some more. Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d384e739ea95..cd18600a8a63 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } -- cgit v1.2.2 From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:55:32 +0100 Subject: sched: Clean up some f_b_g() comments The existing comment tends to grow state (as it already has), split it up and place it near the actual tests. Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cd18600a8a63..03496ebc4553 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. */ if (!(*balance)) goto ret; @@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.2 From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:56:47 +0100 Subject: sched: Fix the group_imb logic On a 2*6*2 machine something like: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' _should_ result in 9 busy CPUs, each running 1 task. However it didn't quite work reliably, most of the time one cpu of the second socket (6-11) would be idle and one cpu of the first socket (0-5) would have two tasks on it. The group_imb logic is supposed to deal with this and detect when a particular group is imbalanced (like in our case, 0-2 are idle but 3-5 will have 4 tasks on it). The detection phase needed a bit of a tweak as it was too weak and required more than 2 avg weight tasks difference between idle and busy cpus in the group which won't trigger for our test-case. So cure that to be one or more avg task weight difference between cpus. Once the detection phase worked, it was then defeated by the f_b_g() tests trying to avoid ping-pongs. In particular, this_load >= max_load triggered because the pulling cpu (the (first) idle cpu in on the second socket, say 6) would find this_load to be 5 and max_load to be 4 (there'd be 5 tasks running on our socket and only 4 on the other socket). Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03496ebc4553..3a88dee165c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) -- cgit v1.2.2 From a2f5c9ab79f78e8b91ac993e0543d65b661dd19b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 22 Feb 2011 13:04:33 -0800 Subject: sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and ensure idle tasks don't preempt idle tasks) so the non-interactive, but still important, SCHED_BATCH tasks will run in favor of the very low priority SCHED_IDLE tasks. Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Richard Purdie LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3a88dee165c0..1438e13cf8be 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; -- cgit v1.2.2 From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 1 Mar 2011 16:28:21 -0800 Subject: sched: Resched proper CPU on yield_to() yield_to_task_fair() has code to resched the CPU of yielding task when the intention is to resched the CPU of the task that is being yielded to. Change here fixes the problem and also makes the resched conditional on rq != p_rq. Signed-off-by: Venkatesh Pallipadi Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1438e13cf8be..3f7ec9e27ee1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); - /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ - if (preempt) - resched_task(rq->curr); - yield_task_fair(rq); return true; -- cgit v1.2.2 From 3436ae1298cb22d722a6520fc97f112dd767a9e1 Mon Sep 17 00:00:00 2001 From: Sisir Koppaka Date: Sat, 26 Mar 2011 18:22:55 +0530 Subject: sched: Fix rebalance interval calculation The interval for checking scheduling domains if they are due to be balanced currently depends on boot state NR_CPUS, which may not accurately reflect the number of online CPUs at the time of check. Thus replace NR_CPUS with num_online_cpus(). (ed: Should only affect those who set NR_CPUS really high, such as 4096 or so :-) Signed-off-by: Sisir Koppaka Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..c7ec5c8e7b44 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -3850,8 +3851,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + if (interval > HZ*num_online_cpus()/10) + interval = HZ*num_online_cpus()/10; need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.2 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..3cb7f07887a1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3061,7 +3061,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ -- cgit v1.2.2 From 49c022e657fbe661460d191fbe776a387132e2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 10:14:25 +0200 Subject: sched: Clean up rebalance_domains() load-balance interval calculation Instead of the possible multiple-evaluation of num_online_cpus() in rebalance_domains() that Linus reported, avoid it altogether in the normal case since it's implemented with a Hamming weight function over a cpu bitmask which can be darn expensive for those with big iron. This also makes it cleaner, smaller and documents the code. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra LKML-Reference: <1301991265.2225.12.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c7ec5c8e7b44..80ecd09452e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3820,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3849,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*num_online_cpus()/10) - interval = HZ*num_online_cpus()/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.2 From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 7 Apr 2011 17:23:22 -0700 Subject: sched: Fix sched-domain avg_load calculation In function find_busiest_group(), the sched-domain avg_load isn't calculated at all if there is a group imbalance within the domain. This will cause erroneous imbalance calculation. The reason is that calculate_imbalance() sees sds->avg_load = 0 and it will dump entire sds->max_load into imbalance variable, which is used later on to migrate entire load from busiest CPU to the puller CPU. This has two really bad effect: 1. stampede of task migration, and they won't be able to break out of the bad state because of positive feedback loop: large load delta -> heavier load migration -> larger imbalance and the cycle goes on. 2. severe imbalance in CPU queue depth. This causes really long scheduling latency blip which affects badly on application that has tight latency requirement. The fix is to have kernel calculate domain avg_load in both cases. This will ensure that imbalance calculation is always sensible and the target is usually half way between busiest and puller CPU. Signed-off-by: Ken Chen Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7f00772e57c9..60f9d407c5ec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. */ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.2 From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Fri, 8 Apr 2011 12:20:16 -0700 Subject: sched: Fix erroneous all_pinned logic The scheduler load balancer has specific code to deal with cases of unbalanced system due to lots of unmovable tasks (for example because of hard CPU affinity). In those situation, it excludes the busiest CPU that has pinned tasks for load balance consideration such that it can perform second 2nd load balance pass on the rest of the system. This all works as designed if there is only one cgroup in the system. However, when we have multiple cgroups, this logic has false positives and triggers multiple load balance passes despite there are actually no pinned tasks at all. The reason it has false positives is that the all pinned logic is deep in the lowest function of can_migrate_task() and is too low level: load_balance_fair() iterates each task group and calls balance_tasks() to migrate target load. Along the way, balance_tasks() will also set a all_pinned variable. Given that task-groups are iterated, this all_pinned variable is essentially the status of last group in the scanning process. Task group can have number of reasons that no load being migrated, none due to cpu affinity. However, this status bit is being propagated back up to the higher level load_balance(), which incorrectly think that no tasks were moved. It kick off the all pinned logic and start multiple passes attempt to move load onto puller CPU. To fix this, move the all_pinned aggregation up at the iterator level. This ensures that the status is aggregated over all task-groups, not just last one in the list. Signed-off-by: Ken Chen Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 60f9d407c5ec..6fa833ab2cb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2153,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3341,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, -- cgit v1.2.2 From f4ad9bd208c98f32a6f9136618e0b8bebe3fb370 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Apr 2011 12:53:09 +0800 Subject: sched: Eliminate dead code from wakeup_gran() calc_delta_fair() checks NICE_0_LOAD already, delete duplicate check. Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1302238389.3981.92.camel@sli10-conroe Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb8..4ee50f0af8d1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1789,10 +1789,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - - return gran; + return calc_delta_fair(gran, se); } /* -- cgit v1.2.2 From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:50 +0200 Subject: sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existance (could possibly be solved by doing the tear-down backwards but this is much more robust). A concequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exiting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0af8d1..4a8ac7c2a18e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3890,6 +3907,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. -- cgit v1.2.2 From a6c75f2f8d988ecfecf971f98f1cb6fc4de522fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:52 +0200 Subject: sched: Avoid using sd->level Don't use sd->level for identifying properties of the domain. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.350174079@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4a8ac7c2a18e..9c5679cfe3b0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2651,7 +2651,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_LOAD_SCALE */ - if (sd->level != SD_LV_SIBLING) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* -- cgit v1.2.2 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0af8d1..96b2c95ac356 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1657,7 +1657,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); -- cgit v1.2.2 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 96b2c95ac356..ad4c414f456d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + lockdep_assert_held(&task_rq(p)->lock); + se->vruntime -= cfs_rq->min_vruntime; } -- cgit v1.2.2 From 3fe1698b7fe05aeb063564e71e40d09f28d8e80c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:48 +0200 Subject: sched: Deal with non-atomic min_vruntime reads on 32bits In order to avoid reading partial updated min_vruntime values on 32bit implement a seqcount like solution. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.111378493@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ad4c414f456d..054cebb81f7b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } /* @@ -1376,10 +1380,21 @@ static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; - lockdep_assert_held(&task_rq(p)->lock); +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; - se->vruntime -= cfs_rq->min_vruntime; + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.2 From 69c80f3e9d3c569f8a3cee94ba1a324b5a7fa6b9 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 13 Apr 2011 18:21:09 -0700 Subject: sched: Make set_*_buddy() work on non-task entities Make set_*_buddy() work on non-task sched_entity, to facilitate the use of next_buddy to cache a group entity in cases where one of the tasks within that entity sleeps or gets preempted. set_skip_buddy() was incorrectly comparing the policy of task that is yielding to be not equal to SCHED_IDLE. Yielding should happen even when task yielding is SCHED_IDLE. This change removes the policy check on the yielding task. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302744070-30079-2-git-send-email-venki@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 87445931a179..501ab637b94b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1846,26 +1846,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; } static void set_next_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; } static void set_skip_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; - } + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; } /* -- cgit v1.2.2 From 2f36825b176f67e5c5228aa33d828bc39718811f Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 14 Apr 2011 10:30:53 -0700 Subject: sched: Next buddy hint on sleep and preempt path When a task in a taskgroup sleeps, pick_next_task starts all the way back at the root and picks the task/taskgroup with the min vruntime across all runnable tasks. But when there are many frequently sleeping tasks across different taskgroups, it makes better sense to stay with same taskgroup for its slice period (or until all tasks in the taskgroup sleeps) instead of switching cross taskgroup on each sleep after a short runtime. This helps specifically where taskgroups corresponds to a process with multiple threads. The change reduces the number of CR3 switches in this case. Example: Two taskgroups with 2 threads each which are running for 2ms and sleeping for 1ms. Looking at sched:sched_switch shows: BEFORE: taskgroup_1 threads [5004, 5005], taskgroup_2 threads [5016, 5017] cpu-soaker-5004 [003] 3683.391089 cpu-soaker-5016 [003] 3683.393106 cpu-soaker-5005 [003] 3683.395119 cpu-soaker-5017 [003] 3683.397130 cpu-soaker-5004 [003] 3683.399143 cpu-soaker-5016 [003] 3683.401155 cpu-soaker-5005 [003] 3683.403168 cpu-soaker-5017 [003] 3683.405170 AFTER: taskgroup_1 threads [21890, 21891], taskgroup_2 threads [21934, 21935] cpu-soaker-21890 [003] 865.895494 cpu-soaker-21935 [003] 865.897506 cpu-soaker-21934 [003] 865.899520 cpu-soaker-21935 [003] 865.901532 cpu-soaker-21934 [003] 865.903543 cpu-soaker-21935 [003] 865.905546 cpu-soaker-21891 [003] 865.907548 cpu-soaker-21890 [003] 865.909560 cpu-soaker-21891 [003] 865.911571 cpu-soaker-21890 [003] 865.913582 cpu-soaker-21891 [003] 865.915594 cpu-soaker-21934 [003] 865.917606 Similar problem is there when there are multiple taskgroups and say a task A preempts currently running task B of taskgroup_1. On schedule, pick_next_task can pick an unrelated task on taskgroup_2. Here it would be better to give some preference to task B on pick_next_task. A simple (may be extreme case) benchmark I tried was tbench with 2 tbench client processes with 2 threads each running on a single CPU. Avg throughput across 5 50 sec runs was: BEFORE: 105.84 MB/sec AFTER: 112.42 MB/sec Signed-off-by: Venkatesh Pallipadi Acked-by: Rik van Riel Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302802253-25760-1-git-send-email-venki@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 501ab637b94b..5280272cce3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1344,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } +static void set_next_buddy(struct sched_entity *se); + /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and @@ -1353,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_sleep = flags & DEQUEUE_SLEEP; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. + */ + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); break; + } flags |= DEQUEUE_SLEEP; } @@ -1877,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; if (unlikely(se == pse)) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); + next_buddy_marked = 1; + } /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1910,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ update_curr(cfs_rq); find_matching_se(&se, &pse); BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. + */ + if (!next_buddy_marked) + set_next_buddy(pse); goto preempt; + } return; -- cgit v1.2.2 From 931aeeda0dca81152aec48f30be01e86a268bf89 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 3 May 2011 22:31:07 +0400 Subject: sched: Remove unused 'this_best_prio arg' from balance_tasks() It's passed across multiple functions but is never really used, so remove it. Signed-off-by: Vladimir Davydov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1304447467-29200-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5280272cce3e..37f22626225e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2142,7 +2142,7 @@ static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct cfs_rq *busiest_cfs_rq) + struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0; long rem_load_move = max_load_move; @@ -2180,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ if (rem_load_move <= 0) break; - - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; } out: /* @@ -2242,7 +2239,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { long rem_load_move = max_load_move; int busiest_cpu = cpu_of(busiest); @@ -2267,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, busiest_cfs_rq); if (!moved_load) @@ -2293,11 +2290,11 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); + &busiest->cfs); } #endif @@ -2314,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { unsigned long total_load_moved = 0, load_moved; - int this_best_prio = this_rq->curr->prio; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); + sd, idle, all_pinned); total_load_moved += load_moved; -- cgit v1.2.2 From 1399fa7807a1a5998bbf147e80668e9950661dfa Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:39 -0700 Subject: sched: Introduce SCHED_POWER_SCALE to scale cpu_power calculations SCHED_LOAD_SCALE is used to increase nice resolution and to scale cpu_power calculations in the scheduler. This patch introduces SCHED_POWER_SCALE and converts all uses of SCHED_LOAD_SCALE for scaling cpu_power to use SCHED_POWER_SCALE instead. This is a preparatory patch for increasing the resolution of SCHED_LOAD_SCALE, and there is no need to increase resolution for cpu_power calculations. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-3-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37f22626225e..e32a9b70ee9c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) nr_running += cpu_rq(i)->cfs.nr_running; } - capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (tmp->flags & SD_POWERSAVINGS_BALANCE) nr_running /= 2; @@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { - return SCHED_LOAD_SCALE; + return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) @@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu) available = total - rq->rt_avg; } - if (unlikely((s64)total < SCHED_LOAD_SCALE)) - total = SCHED_LOAD_SCALE; + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; - total >>= SCHED_LOAD_SHIFT; + total >>= SCHED_POWER_SHIFT; return div_u64(available, total); } @@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_LOAD_SCALE; + unsigned long power = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { @@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_smt_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; } sdg->cpu_power_orig = power; @@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_freq_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; power *= scale_rt_power(cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; if (!power) power = 1; @@ -2682,7 +2682,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_LOAD_SCALE + * Only siblings can have significantly less than SCHED_POWER_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; /* * Consider the group unbalanced when the imbalance is larger @@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); sgs->group_weight = group->group_weight; @@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, return 0; *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, - SCHED_LOAD_SCALE); + SCHED_POWER_SCALE); return 1; } @@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, cpu_avg_load_per_task(this_cpu); scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_LOAD_SCALE; + * SCHED_POWER_SCALE; scaled_busy_load_per_task /= sds->busiest->cpu_power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= @@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, min(sds->busiest_load_per_task, sds->max_load); pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_LOAD_SCALE; + pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->busiest->cpu_power; if (sds->max_load > tmp) pwr_move += sds->busiest->cpu_power * @@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, /* Amount of load we'd add */ if (sds->max_load * sds->busiest->cpu_power < - sds->busiest_load_per_task * SCHED_LOAD_SCALE) + sds->busiest_load_per_task * SCHED_POWER_SCALE) tmp = (sds->max_load * sds->busiest->cpu_power) / sds->this->cpu_power; else - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->this->cpu_power; pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; + pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) @@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity = (sds->busiest_nr_running - sds->busiest_group_capacity); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= sds->busiest->cpu_power; } @@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, (sds->avg_load - sds->this_load) * sds->this->cpu_power) - / SCHED_LOAD_SCALE; + / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; /* * If the busiest group is imbalanced the below checks don't @@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); unsigned long wl; if (!capacity) @@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, * the load can be moved away from the cpu that is potentially * running at a lower capacity. */ - wl = (wl * SCHED_LOAD_SCALE) / power; + wl = (wl * SCHED_POWER_SCALE) / power; if (wl > max_load) { max_load = wl; -- cgit v1.2.2 From 1e876231785d82443a5ac8b6c660e9f51bc5dede Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 May 2011 16:21:10 -0700 Subject: sched: Fix ->min_vruntime calculation in dequeue_entity() Dima Zavin reported: "After pulling the thread off the run-queue during a cgroup change, the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime then gets normalized to this new value. This can then lead to the thread getting an unfair boost in the new group if the vruntime of the next task in the old run-queue was way further ahead." Reported-by: Dima Zavin Signed-off-by: John Stultz Recalls-having-tested-once-upon-a-time-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305674470-23727-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e32a9b70ee9c..433491c2dc8f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->on_rq = 0; update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); - update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + + update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); } /* -- cgit v1.2.2 From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2011 13:00:06 +0200 Subject: sched: Break out cpu_power from the sched_group structure In order to prepare for non-unique sched_groups per domain, we need to carry the cpu_power elsewhere, so put a level of indirection in. Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8f..c768588e180b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; if (local_group) { this_load = avg_load; @@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->cpu_power_orig = power; + sdg->sgp->power_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->cpu_power = power; + sdg->sgp->power = power; } static void update_group_power(struct sched_domain *sd, int cpu) @@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->cpu_power; + power += group->sgp->power; group = group->next; } while (group != child->groups); - sdg->cpu_power = power; + sdg->sgp->power = power; } /* @@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * If ~90% of the cpu_power is still there, we're good. */ - if (group->cpu_power * 32 > group->cpu_power_orig * 29) + if (group->sgp->power * 32 > group->sgp->power_orig * 29) return 1; return 0; @@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; /* * Consider the group unbalanced when the imbalance is larger @@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); @@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += sg->cpu_power; + sds->total_pwr += sg->sgp->power; /* * In case the child domain prefers tasks go to siblings @@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, if (this_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->cpu_power; + scaled_busy_load_per_task /= sds->busiest->sgp->power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->cpu_power * + pwr_now += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * + pwr_now += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->cpu_power; + sds->busiest->sgp->power; if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * + pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < + if (sds->max_load * sds->busiest->sgp->power < sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; else tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->cpu_power; + load_above_capacity /= sds->busiest->sgp->power; } /* @@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; /* -- cgit v1.2.2