path: root/kernel/sched/core.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 00:21:34 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 00:21:34 -0500
commit    86c6a2fddf0b89b494c7616f2c06cf915c4bff01 (patch)
tree      0e6930c93e5d49ead71b17fcadf0cc9ba28c3d2d /kernel/sched/core.c
parent    bee2782f30f66898be3f74ad02e4d1f87a969694 (diff)
parent    fd7de1e8d5b2b2b35e71332fafb899f584597150 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

  - 'Nested Sleep Debugging', activated when CONFIG_DEBUG_ATOMIC_SLEEP=y.

    This instruments might_sleep() checks to catch places that nest
    blocking primitives - such as mutex usage in a wait loop.  Such bugs
    can result in hard-to-debug races/hangs.

    Another category of invalid nesting that this facility will detect
    is the calling of blocking functions from within schedule() ->
    sched_submit_work() -> blk_schedule_flush_plug().

    There's some potential for false positives (if secondary blocking
    primitives themselves are not ready yet for this facility), but the
    kernel will warn once per bootup about such bugs, so the warning
    isn't much of a nuisance.

    This feature comes with a number of fixes for problems uncovered
    with it, so no messages are expected normally.

  - Another round of sched/numa optimizations and refinements, for
    CONFIG_NUMA_BALANCING=y.

  - Another round of sched/dl fixes and refinements.

  Plus various smaller fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched: Add missing rcu protection to wake_up_all_idle_cpus
  sched/deadline: Introduce start_hrtick_dl() for !CONFIG_SCHED_HRTICK
  sched/numa: Init numa balancing fields of init_task
  sched/deadline: Remove unnecessary definitions in cpudeadline.h
  sched/cpupri: Remove unnecessary definitions in cpupri.h
  sched/deadline: Fix rq->dl.pushable_tasks bug in push_dl_task()
  sched/fair: Fix stale overloaded status in the busiest group finding logic
  sched: Move p->nr_cpus_allowed check to select_task_rq()
  sched/completion: Document when to use wait_for_completion_io_*()
  sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC
  sched/fair: Kill task_struct::numa_entry and numa_group::task_list
  sched: Refactor task_struct to use numa_faults instead of numa_* pointers
  sched/deadline: Don't check CONFIG_SMP in switched_from_dl()
  sched/deadline: Reschedule from switched_from_dl() after a successful pull
  sched/deadline: Push task away if the deadline is equal to curr during wakeup
  sched/deadline: Add deadline rq status print
  sched/deadline: Fix artificial overrun introduced by yield_task_dl()
  sched/rt: Clean up check_preempt_equal_prio()
  sched/core: Use dl_bw_of() under rcu_read_lock_sched()
  sched: Check if we got a shallowest_idle_cpu before searching for least_loaded_cpu
  ...
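To make the 'Nested Sleep Debugging' pattern above concrete, here is a minimal kernel-style sketch of the kind of wait loop the new instrumentation flags. This is illustrative only and not part of this merge; the demo_* identifiers are hypothetical.

#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static DEFINE_MUTEX(demo_lock);
static bool demo_done;

static void demo_wait_for_done(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* Sets current->state to TASK_UNINTERRUPTIBLE. */
                prepare_to_wait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);

                /*
                 * mutex_lock() may block and calls might_sleep(); with
                 * CONFIG_DEBUG_ATOMIC_SLEEP=y the new check fires here,
                 * because current->state is no longer TASK_RUNNING.
                 */
                mutex_lock(&demo_lock);
                if (demo_done) {
                        mutex_unlock(&demo_lock);
                        break;
                }
                mutex_unlock(&demo_lock);

                schedule();
        }
        finish_wait(&demo_wq, &wait);
}

With CONFIG_DEBUG_ATOMIC_SLEEP=y, the mutex_lock() inside the loop trips the new WARN_ONCE() in __might_sleep() shown in the diff below ("do not call blocking ops when !TASK_RUNNING"), because the task state was already set by prepare_to_wait().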
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  241
1 file changed, 185 insertions(+), 56 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * Can drop rq->lock because sched_class::switched_from() methods drop it.
+ */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
+		/* Possible rq->lock 'hole'. */
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_preempt_count(p) & PREEMPT_ACTIVE));
+			!p->on_rq);
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1407,7 +1411,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+	if (p->nr_cpus_allowed > 1)
+		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	if (!is_idle_task(rq->curr))
-		return;
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
 
 	if (set_nr_if_polling(rq->idle)) {
 		trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
 		/* Else cpu is not in idle, do nothing here */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+out:
+	rcu_read_unlock();
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults_memory = NULL;
-	p->numa_faults_buffer_memory = NULL;
+	p->numa_faults = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 
-	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 
 /**
  * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
 */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
+	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+	return rq;
 }
 
 #ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
-
-	finish_task_switch(rq, prev);
+	struct rq *rq;
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
+	/* finish_task_switch() drops rq->lock and enables preemption */
+	preempt_disable();
+	rq = finish_task_switch(prev);
 	post_schedule(rq);
+	preempt_enable();
 
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
 */
-static inline void
+static inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
-
 	barrier();
-	/*
-	 * this_rq must be evaluated again because prev may have moved
-	 * CPUs since it called schedule(), thus the 'rq' on its stack
-	 * frame will be invalid.
-	 */
-	finish_task_switch(this_rq(), prev);
+
+	return finish_task_switch(prev);
 }
 
 /*
@@ -2826,15 +2813,8 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
-		context_switch(rq, prev, next); /* unlocks the rq */
-		/*
-		 * The context switch have flipped the stack from under us
-		 * and restored the local variables which were saved when
-		 * this task called schedule() in the past. prev == current
-		 * is still correct, but it can be moved to another cpu/rq.
-		 */
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+		rq = context_switch(rq, prev, next); /* unlocks the rq */
+		cpu = cpu_of(rq);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+			      const struct cpumask *trial)
+{
+	int ret = 1, trial_cpus;
+	struct dl_bw *cur_dl_b;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	cur_dl_b = dl_bw_of(cpumask_any(cur));
+	trial_cpus = cpumask_weight(trial);
+
+	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+	if (cur_dl_b->bw != -1 &&
+	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+		ret = 0;
+	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+	rcu_read_unlock_sched();
+
+	return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+#ifdef CONFIG_SMP
+	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+					      cs_cpus_allowed)) {
+		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+		struct dl_bw *dl_b;
+		bool overflow;
+		int cpus;
+		unsigned long flags;
+
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(dest_cpu);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(dest_cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+		if (overflow)
+			ret = -EBUSY;
+		else {
+			/*
+			 * We reserve space for this task in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, p->dl.dl_bw);
+		}
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		rcu_read_unlock_sched();
+
+	}
+#endif
+out:
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /*
 * move_queued_task - move a queued task to new rq.
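For reference, the deadline-bandwidth admission test that task_can_attach() performs through __dl_overflow() reduces to the arithmetic in the helpers removed from this file above. A standalone restatement follows (plain C sketch with illustrative numbers, not kernel code; dl_overflow_sketch is a hypothetical name):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the __dl_overflow() test: bw is the per-CPU bandwidth limit
 * (-1 means "no limit"), total_bw is the bandwidth already admitted on
 * the root_domain, and the caller wants to replace old_bw with new_bw.
 */
static bool dl_overflow_sketch(int64_t bw, uint64_t total_bw, int cpus,
                               uint64_t old_bw, uint64_t new_bw)
{
        return bw != -1 && (uint64_t)bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
        /* Limit 950000 per CPU on 2 CPUs, 1500000 already admitted:
         * admitting another 500000 overflows (2000000 > 1900000). */
        printf("overflow: %d\n",
               dl_overflow_sketch(950000, 1500000, 2, 0, 500000));
        return 0;
}

task_can_attach() calls the check with old_bw == 0 and new_bw == p->dl.dl_bw, i.e. it asks whether the destination root_domain can absorb the task's entire reserved bandwidth; on success the bandwidth is accounted immediately with __dl_add().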
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
 static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 #endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
 	printk(KERN_WARNING "\n");
 }
 
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
 {
 	int i;
 
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
 	return false;
 }
 
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ *   is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ *   there is an intermediary node C, which is < N hops away from both
+ *   nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+	int a, b, c, n;
+
+	n = sched_max_numa_distance;
+
+	if (n <= 1)
+		sched_numa_topology_type = NUMA_DIRECT;
+
+	for_each_online_node(a) {
+		for_each_online_node(b) {
+			/* Find two nodes furthest removed from each other. */
+			if (node_distance(a, b) < n)
+				continue;
+
+			/* Is there an intermediary node between a and b? */
+			for_each_online_node(c) {
+				if (node_distance(a, c) < n &&
+				    node_distance(b, c) < n) {
+					sched_numa_topology_type =
+							NUMA_GLUELESS_MESH;
+					return;
+				}
+			}
+
+			sched_numa_topology_type = NUMA_BACKPLANE;
+			return;
+		}
+	}
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = level;
+	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+	init_numa_topology_type();
 }
 
 static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state,
+	 * since we will exit with TASK_RUNNING, make sure we enter with it,
+	 * otherwise we will destroy state.
+	 */
+	if (WARN_ONCE(current->state != TASK_RUNNING,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change))
+		__set_current_state(TASK_RUNNING);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 #endif
 	dump_stack();
 }
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ