| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-14 02:55:11 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-14 02:55:11 -0500 |
| commit | fe8a45df368038566c62bf311accf4319b210123 | |
| tree | c39ab7df07ccf71eaeed742d3a575c269f992cbc | |
| parent | 5e30025a319910695f5010dc0fb53a23299da14d | |
| parent | 85b088e934b9943322bfe37077289ae60f1b3414 | |
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Four bugfixes and one performance fix"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Avoid integer overflow
sched: Optimize task_sched_runtime()
sched/numa: Cure update_numa_stats() vs. hotplug
sched/numa: Fix NULL pointer dereference in task_numa_migrate()
sched: Fix endless sync_sched/rcu() loop inside _cpu_down()
| -rw-r--r-- | kernel/cpu.c | 5 |
| -rw-r--r-- | kernel/sched/core.c | 14 |
| -rw-r--r-- | kernel/sched/fair.c | 31 |

3 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 973d034acf84..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                                 __func__, cpu);
                 goto out_release;
         }
-        smpboot_park_threads(cpu);
 
         /*
          * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
          *
          * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
          * not imply sync_sched(), so explicitly call both.
+         *
+         * Do sync before park smpboot threads to take care the rcu boost case.
          */
 #ifdef CONFIG_PREEMPT
         synchronize_sched();
 #endif
         synchronize_rcu();
 
+        smpboot_park_threads(cpu);
+
         /*
          * So now all preempt/rcu users must observe !cpu_active().
          */
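
The hunk above reorders _cpu_down(): the synchronize calls now run while the smpboot threads are still unparked, which the new in-tree comment attributes to the RCU boost case. As a loose userspace analogue of that class of ordering bug (parking a helper before waiting for work only the helper can finish), here is a minimal sketch; every name in it (helper_park(), wait_for_quiescence(), and so on) is invented for illustration and none of it is kernel code.

```c
/* Build with: cc -pthread park-order.c */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool parked;          /* helper must not make progress while set */
static int  pending = 1;     /* outstanding work the waiter depends on  */

static void *helper(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (pending) {
                if (parked) {              /* a parked helper cannot finish work */
                        pthread_cond_wait(&cond, &lock);
                        continue;
                }
                pending--;                 /* "complete the grace period" */
                pthread_cond_broadcast(&cond);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void wait_for_quiescence(void)      /* stands in for synchronize_rcu() */
{
        pthread_mutex_lock(&lock);
        while (pending)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void helper_park(void)              /* stands in for smpboot_park_threads() */
{
        pthread_mutex_lock(&lock);
        parked = true;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, helper, NULL);

        /* The patched order: wait first, park afterwards. Swapping these
         * two calls leaves both threads waiting on each other forever. */
        wait_for_quiescence();
        helper_park();

        pthread_join(t, NULL);
        return 0;
}
```
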
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1deccd78be98..c1808606ee5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
         struct rq *rq;
         u64 ns = 0;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+        /*
+         * 64-bit doesn't need locks to atomically read a 64bit value.
+         * So we have a optimization chance when the task's delta_exec is 0.
+         * Reading ->on_cpu is racy, but this is ok.
+         *
+         * If we race with it leaving cpu, we'll take a lock. So we're correct.
+         * If we race with it entering cpu, unaccounted time is 0. This is
+         * indistinguishable from the read occurring a few cycles earlier.
+         */
+        if (!p->on_cpu)
+                return p->se.sum_exec_runtime;
+#endif
+
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, p, &flags);
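
The performance fix above adds a lock-free fast path: on 64-bit SMP kernels a naturally aligned 64-bit load is a single atomic access, so a task that is not currently on a CPU has no partial slice to account and its sum_exec_runtime can be returned without taking the rq lock. Below is a small userspace sketch of the same pattern, not the kernel implementation; the struct, field, and function names are all invented for the example.

```c
#include <inttypes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct runtime_stats {
        pthread_mutex_t  lock;      /* protects delta_ns (the slow path)  */
        atomic_int       on_cpu;    /* non-zero while the task is running */
        _Atomic uint64_t total_ns;  /* published cumulative runtime       */
        uint64_t         delta_ns;  /* partial slice, valid under lock    */
};

static uint64_t read_runtime(struct runtime_stats *st)
{
#if UINTPTR_MAX > 0xffffffffu   /* rough stand-in for CONFIG_64BIT && CONFIG_SMP */
        /*
         * Racy but safe, as the patch comment argues: if the task is not
         * running there is no partial slice to add, and racing with it
         * starting to run is indistinguishable from having read the
         * counter a few cycles earlier.
         */
        if (!atomic_load_explicit(&st->on_cpu, memory_order_acquire))
                return atomic_load_explicit(&st->total_ns, memory_order_relaxed);
#endif
        /* Slow path: take the lock so total_ns + delta_ns is consistent. */
        pthread_mutex_lock(&st->lock);
        uint64_t ns = atomic_load_explicit(&st->total_ns, memory_order_relaxed)
                      + st->delta_ns;
        pthread_mutex_unlock(&st->lock);
        return ns;
}

int main(void)
{
        struct runtime_stats st = {
                .lock     = PTHREAD_MUTEX_INITIALIZER,
                .total_ns = 123456789,
        };

        printf("runtime: %" PRIu64 " ns\n", read_runtime(&st));
        return 0;
}
```
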
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c605c7a6..e8b652ebe027 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-        int cpu;
+        int cpu, cpus = 0;
 
         memset(ns, 0, sizeof(*ns));
         for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
                 ns->nr_running += rq->nr_running;
                 ns->load += weighted_cpuload(cpu);
                 ns->power += power_of(cpu);
+
+                cpus++;
         }
 
+        /*
+         * If we raced with hotplug and there are no CPUs left in our mask
+         * the @ns structure is NULL'ed and task_numa_compare() will
+         * not find this node attractive.
+         *
+         * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+         * and bail there.
+         */
+        if (!cpus)
+                return;
+
         ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
         ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
         ns->has_capacity = (ns->nr_running < ns->capacity);
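
The early return matters because a node whose CPU mask was emptied by a concurrent hotplug operation would otherwise reach `ns->load * SCHED_POWER_SCALE / ns->power` with ns->power still zero. A stripped-down sketch of the same shape, with every name invented for illustration:

```c
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define POWER_SCALE 1024u

struct node_stats {
        uint64_t load;
        uint64_t power;
};

/* ncpus may be zero if the node's CPUs were hot-unplugged under us. */
static void summarize_node(struct node_stats *ns, const uint64_t *cpu_load,
                           const uint64_t *cpu_power, size_t ncpus)
{
        *ns = (struct node_stats){ 0 };

        for (size_t i = 0; i < ncpus; i++) {
                ns->load  += cpu_load[i];
                ns->power += cpu_power[i];
        }

        /*
         * Mirrors the new "if (!cpus) return;": with nothing accumulated
         * the stats stay zeroed and the division below is never reached.
         */
        if (!ncpus)
                return;

        ns->load = ns->load * POWER_SCALE / ns->power;
}

int main(void)
{
        struct node_stats ns;

        summarize_node(&ns, NULL, NULL, 0);   /* the racy, empty-node case */
        printf("load=%" PRIu64 " power=%" PRIu64 "\n", ns.load, ns.power);
        return 0;
}
```
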
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
          */
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+        if (sd)
+                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
+        /*
+         * Cpusets can break the scheduler domain tree into smaller
+         * balance domains, some of which do not cross NUMA boundaries.
+         * Tasks that are "trapped" in such domains cannot be migrated
+         * elsewhere, so there is no point in (re)trying.
+         */
+        if (unlikely(!sd)) {
+                p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+                return -EINVAL;
+        }
+
         taskweight = task_weight(p, env.src_nid);
         groupweight = group_weight(p, env.src_nid);
         update_numa_stats(&env.src_stats, env.src_nid);
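
Here the lookup itself can legitimately come back NULL (as the new comment explains, cpusets can leave a CPU with no NUMA-spanning domain), so the pointer is dereferenced only while it is known to be non-NULL and the NULL case gets an explicit bail-out. The shape of that fix in a userspace sketch, where a reader lock stands in for rcu_read_lock() and every name is invented:

```c
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct domain { int imbalance_pct; };

static pthread_rwlock_t domains_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct domain *numa_domain_of[64];   /* entries may be NULL */

static int pick_imbalance_pct(int cpu, int *pct)
{
        struct domain *d;

        pthread_rwlock_rdlock(&domains_lock);
        d = numa_domain_of[cpu];
        if (d)                                   /* mirrors "if (sd)" */
                *pct = 100 + (d->imbalance_pct - 100) / 2;
        pthread_rwlock_unlock(&domains_lock);

        if (!d)                                  /* mirrors the bail-out */
                return -EINVAL;
        return 0;
}

int main(void)
{
        int pct = 100;
        int err = pick_imbalance_pct(0, &pct);

        /* No domain is registered in this sketch, so err is -EINVAL. */
        printf("err=%d pct=%d\n", err, pct);
        return 0;
}
```
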
@@ -2153,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
         long contrib;
 
         /* The fraction of a cpu used by this cfs_rq */
-        contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+        contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
                           sa->runnable_avg_period + 1);
         contrib -= cfs_rq->tg_runnable_contrib;
 
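
The one-character change in the last hunk is about C arithmetic rules: runnable_avg_sum is a 32-bit value, its left shift is performed in 32-bit arithmetic, and any bits pushed past bit 31 are lost before div_u64() ever sees a 64-bit operand; the (u64) cast makes the shift happen in 64 bits. The values below are chosen only to make the truncation visible, since how large the real sum and NICE_0_SHIFT can get depends on the kernel configuration.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t sum   = 47742;     /* illustrative accumulator value  */
        unsigned shift = 20;        /* illustrative NICE_0_SHIFT value */

        uint64_t truncated = sum << shift;           /* shift done in 32 bits */
        uint64_t correct   = (uint64_t)sum << shift; /* shift done in 64 bits */

        printf("without cast: %" PRIu64 "\n", truncated); /* 2816475136 (wrapped) */
        printf("with cast:    %" PRIu64 "\n", correct);   /* 50061115392 */
        return 0;
}
```
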
