author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-14 02:55:11 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-14 02:55:11 -0500
commit     fe8a45df368038566c62bf311accf4319b210123 (patch)
tree       c39ab7df07ccf71eaeed742d3a575c269f992cbc /kernel
parent     5e30025a319910695f5010dc0fb53a23299da14d (diff)
parent     85b088e934b9943322bfe37077289ae60f1b3414 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Four bugfixes and one performance fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid integer overflow
  sched: Optimize task_sched_runtime()
  sched/numa: Cure update_numa_stats() vs. hotplug
  sched/numa: Fix NULL pointer dereference in task_numa_migrate()
  sched: Fix endless sync_sched/rcu() loop inside _cpu_down()
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/cpu.c          5
-rw-r--r--   kernel/sched/core.c  14
-rw-r--r--   kernel/sched/fair.c  31
3 files changed, 46 insertions, 4 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 973d034acf84..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 			__func__, cpu);
 		goto out_release;
 	}
-	smpboot_park_threads(cpu);
 
 	/*
 	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	 *
 	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
 	 * not imply sync_sched(), so explicitly call both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
 #ifdef CONFIG_PREEMPT
 	synchronize_sched();
 #endif
 	synchronize_rcu();
 
+	smpboot_park_threads(cpu);
+
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
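The fix above is purely an ordering change: per the new comment, the RCU grace period may depend on progress made by RCU's per-CPU boost kthreads, which are smpboot threads, so synchronize_sched()/synchronize_rcu() must complete before smpboot_park_threads(). A loose userspace analogy of that ordering, with invented names (helper, wait_for_grace_period) and no kernel APIs:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool work_done;		/* set by the helper thread           */
static atomic_bool helper_parked;	/* tells the helper thread to stop    */

static void *helper(void *arg)
{
	(void)arg;
	while (!atomic_load(&helper_parked)) {
		atomic_store(&work_done, true);	/* progress the waiter depends on */
		usleep(1000);
	}
	return NULL;
}

static void wait_for_grace_period(void)
{
	while (!atomic_load(&work_done))	/* spins forever if the helper was */
		usleep(1000);			/* parked before it ever ran       */
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, helper, NULL);

	wait_for_grace_period();		/* 1. synchronize first ...        */
	atomic_store(&helper_parked, true);	/* 2. ... then park the helper     */
	pthread_join(tid, NULL);

	puts("grace period finished before the helper was parked");
	return 0;
}

Reversing the two steps in main() would park the helper before it could make progress and the wait would never return, which is the shape of the endless loop named in the patch title.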
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1deccd78be98..c1808606ee5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	struct rq *rq;
 	u64 ns = 0;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+	/*
+	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * So we have a optimization chance when the task's delta_exec is 0.
+	 * Reading ->on_cpu is racy, but this is ok.
+	 *
+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
+	 * If we race with it entering cpu, unaccounted time is 0. This is
+	 * indistinguishable from the read occurring a few cycles earlier.
+	 */
+	if (!p->on_cpu)
+		return p->se.sum_exec_runtime;
+#endif
+
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
 	task_rq_unlock(rq, p, &flags);
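A minimal userspace sketch of the fast path added above, with invented names (task_clock, task_runtime) and a pthread mutex standing in for the runqueue lock; like the kernel comment, it assumes an aligned 64-bit load is atomic on the target:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct task_clock {
	pthread_mutex_t lock;
	bool on_cpu;			/* racy unlocked read is tolerated     */
	uint64_t sum_exec_runtime;	/* only advances while on_cpu is true  */
};

static uint64_t task_runtime(struct task_clock *t)
{
	uint64_t ns;

	/* fast path: nothing is accumulating, so the sum is stable */
	if (!t->on_cpu)
		return t->sum_exec_runtime;

	/* slow path: serialize against the updater, as task_rq_lock() does */
	pthread_mutex_lock(&t->lock);
	ns = t->sum_exec_runtime;	/* the kernel also adds the pending delta here */
	pthread_mutex_unlock(&t->lock);
	return ns;
}

int main(void)
{
	struct task_clock t = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.on_cpu = false,
		.sum_exec_runtime = 123456789ULL,
	};

	printf("runtime = %llu ns\n", (unsigned long long)task_runtime(&t));
	return 0;
}

Both outcomes of the racy on_cpu check are safe: seeing it set just falls back to the locked path, and seeing it clear returns a value indistinguishable from a read taken a few cycles earlier.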
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c605c7a6..e8b652ebe027 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-	int cpu;
+	int cpu, cpus = 0;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 		ns->nr_running += rq->nr_running;
 		ns->load += weighted_cpuload(cpu);
 		ns->power += power_of(cpu);
+
+		cpus++;
 	}
 
+	/*
+	 * If we raced with hotplug and there are no CPUs left in our mask
+	 * the @ns structure is NULL'ed and task_numa_compare() will
+	 * not find this node attractive.
+	 *
+	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+	 * and bail there.
+	 */
+	if (!cpus)
+		return;
+
 	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
 	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
 	ns->has_capacity = (ns->nr_running < ns->capacity);
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+	if (sd)
+		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 	rcu_read_unlock();
 
+	/*
+	 * Cpusets can break the scheduler domain tree into smaller
+	 * balance domains, some of which do not cross NUMA boundaries.
+	 * Tasks that are "trapped" in such domains cannot be migrated
+	 * elsewhere, so there is no point in (re)trying.
+	 */
+	if (unlikely(!sd)) {
+		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+		return -EINVAL;
+	}
+
 	taskweight = task_weight(p, env.src_nid);
 	groupweight = group_weight(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
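The update_numa_stats() change above guards a hotplug race: if the node's CPU mask has been emptied, ns->power stays zero and the load scaling would divide by zero, so the function now counts the CPUs it actually visited and returns the zeroed stats early (the task_numa_migrate() hunk handles the related case of a NULL sd_numa domain under cpusets). A standalone sketch with invented names (node_stats, update_stats) and made-up power values:

#include <stdio.h>
#include <string.h>

#define SCALE 1024	/* stand-in for SCHED_POWER_SCALE */

struct node_stats {
	unsigned long load;
	unsigned long power;
	unsigned long capacity;
	unsigned long nr_running;
	int has_capacity;
};

static void update_stats(struct node_stats *ns,
			 const unsigned long *cpu_power, int ncpus)
{
	int cpu, cpus = 0;

	memset(ns, 0, sizeof(*ns));
	for (cpu = 0; cpu < ncpus; cpu++) {
		ns->power += cpu_power[cpu];
		cpus++;
	}

	/* raced with hotplug: no CPUs left, keep the zeroed stats and
	 * skip the scaling below, which would divide by zero */
	if (!cpus)
		return;

	ns->load = ns->load * SCALE / ns->power;
	ns->capacity = (ns->power + SCALE / 2) / SCALE;	/* DIV_ROUND_CLOSEST */
	ns->has_capacity = ns->nr_running < ns->capacity;
}

int main(void)
{
	unsigned long powers[] = { 1024, 1024 };
	struct node_stats ns;

	update_stats(&ns, powers, 2);	/* normal node */
	printf("2 CPUs: capacity=%lu has_capacity=%d\n", ns.capacity, ns.has_capacity);

	update_stats(&ns, powers, 0);	/* node emptied by CPU hotplug */
	printf("0 CPUs: capacity=%lu has_capacity=%d\n", ns.capacity, ns.has_capacity);
	return 0;
}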
@@ -2153,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 	long contrib;
 
 	/* The fraction of a cpu used by this cfs_rq */
-	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
 			  sa->runnable_avg_period + 1);
 	contrib -= cfs_rq->tg_runnable_contrib;
 
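The final hunk widens runnable_avg_sum to u64 before the shift so the whole expression is evaluated in 64-bit arithmetic. A small demonstration of the difference, assuming NICE_0_SHIFT is 10 and using a deliberately oversized sample value so the 32-bit wrap is visible (whether the scheduler's real bounds on runnable_avg_sum can reach it is a separate question):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NICE_0_SHIFT 10	/* assumed value for illustration */

int main(void)
{
	/* illustrative values only, not taken from a real workload */
	uint32_t runnable_avg_sum = 5000000u;
	uint32_t runnable_avg_period = 5000000u;

	/* pre-patch shape: the shift happens in 32-bit arithmetic and wraps */
	uint32_t narrow = runnable_avg_sum << NICE_0_SHIFT;

	/* patched shape: widen to 64 bits first, then shift and divide */
	uint64_t wide = (uint64_t)runnable_avg_sum << NICE_0_SHIFT;

	printf("32-bit shift: %" PRIu32 "\n", narrow);
	printf("64-bit shift: %" PRIu64 "\n", wide);
	printf("contrib:      %" PRIu64 "\n", wide / (runnable_avg_period + 1));
	return 0;
}

The cast only changes the width in which the shift is performed; div_u64() already takes a 64-bit dividend, so without it a wrapped 32-bit intermediate could be passed in.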