Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 241
 1 file changed, 185 insertions, 56 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
+		/* Possble rq->lock 'hole'.  */
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_preempt_count(p) & PREEMPT_ACTIVE));
+			!p->on_rq);
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1407,7 +1411,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+	if (p->nr_cpus_allowed > 1)
+		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	if (!is_idle_task(rq->curr))
-		return;
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
 
 	if (set_nr_if_polling(rq->idle)) {
 		trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
 		/* Else cpu is not in idle, do nothing here */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+out:
+	rcu_read_unlock();
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults_memory = NULL;
-	p->numa_faults_buffer_memory = NULL;
+	p->numa_faults = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 
-	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
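The __dl_* accounting helpers are only removed from this file; task_can_attach() added further down still calls __dl_overflow() and __dl_add(), so they evidently move to a header shared with the deadline class (not visible in this per-file diff). A standalone sketch of the admission test they implement, with an invented struct and example numbers (the kernel stores bandwidths as fixed-point values scaled by 1 << 20):

/*
 * Sketch only: the struct and numbers below are invented for illustration;
 * the test itself matches the removed __dl_overflow() helper.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw_example {
	int64_t bw;		/* per-CPU cap, -1 means "no limit"  */
	uint64_t total_bw;	/* bandwidth already admitted        */
};

static bool dl_overflow_example(const struct dl_bw_example *dl_b, int cpus,
				uint64_t old_bw, uint64_t new_bw)
{
	/* Would the new allocation exceed cap * cpus? */
	return dl_b->bw != -1 &&
	       (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* 50% cap per CPU on 4 CPUs, 150% of a CPU already admitted. */
	struct dl_bw_example dl_b = { .bw = 1 << 19, .total_bw = 3u << 19 };

	/* 150% + 40% = 190% <= 200% total cap: fits. */
	printf("adding a 40%% task overflows: %d\n",
	       dl_overflow_example(&dl_b, 4, 0, (1u << 20) * 2 / 5));

	/* 150% + 60% = 210% > 200% total cap: rejected. */
	printf("adding a 60%% task overflows: %d\n",
	       dl_overflow_example(&dl_b, 4, 0, (1u << 20) * 3 / 5));
	return 0;
}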
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 
 /**
  * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
  */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
+	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+	return rq;
 }
 
 #ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
-
-	finish_task_switch(rq, prev);
+	struct rq *rq;
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
+	/* finish_task_switch() drops rq->lock and enables preemtion */
+	preempt_disable();
+	rq = finish_task_switch(prev);
 	post_schedule(rq);
+	preempt_enable();
 
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
  */
-static inline void
+static inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
-
 	barrier();
-	/*
-	 * this_rq must be evaluated again because prev may have moved
-	 * CPUs since it called schedule(), thus the 'rq' on its stack
-	 * frame will be invalid.
-	 */
-	finish_task_switch(this_rq(), prev);
+
+	return finish_task_switch(prev);
 }
 
 /*
@@ -2826,15 +2813,8 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
-		context_switch(rq, prev, next); /* unlocks the rq */
-		/*
-		 * The context switch have flipped the stack from under us
-		 * and restored the local variables which were saved when
-		 * this task called schedule() in the past. prev == current
-		 * is still correct, but it can be moved to another cpu/rq.
-		 */
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+		rq = context_switch(rq, prev, next); /* unlocks the rq */
+		cpu = cpu_of(rq);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
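The removed comment above and the new kernel-doc on finish_task_switch() give the reason for this calling-convention change: after switch_to() the stack belongs to a task that may have last run on another CPU, so any cached rq/cpu locals are stale and must be re-derived. A small userspace analogy of that pattern (all names invented for illustration):

/*
 * Userspace analogy (all names invented): the callee re-derives the per-CPU
 * context and returns it, instead of the caller recomputing it afterwards.
 */
#include <stdio.h>

struct rq_example { int cpu; };

static struct rq_example rqs[2] = { { 0 }, { 1 } };

/* Stand-in for this_rq(): pretend we were migrated while switched out. */
static struct rq_example *current_rq_example(const struct rq_example *before)
{
	return &rqs[!before->cpu];
}

/* Stand-in for context_switch(): returns the rq that is valid afterwards. */
static struct rq_example *switch_example(struct rq_example *rq)
{
	/* ... switch_to() would run here; 'rq' may be stale on return ... */
	return current_rq_example(rq);
}

int main(void)
{
	struct rq_example *rq = &rqs[0];

	rq = switch_example(rq);	/* never reuse the pre-switch pointer */
	printf("now on cpu %d\n", rq->cpu);
	return 0;
}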
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+			      const struct cpumask *trial)
+{
+	int ret = 1, trial_cpus;
+	struct dl_bw *cur_dl_b;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	cur_dl_b = dl_bw_of(cpumask_any(cur));
+	trial_cpus = cpumask_weight(trial);
+
+	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+	if (cur_dl_b->bw != -1 &&
+	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+		ret = 0;
+	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+	rcu_read_unlock_sched();
+
+	return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary. Thus, cpusets are not
+	 * applicable for such threads. This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+#ifdef CONFIG_SMP
+	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+					      cs_cpus_allowed)) {
+		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+		struct dl_bw *dl_b;
+		bool overflow;
+		int cpus;
+		unsigned long flags;
+
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(dest_cpu);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(dest_cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+		if (overflow)
+			ret = -EBUSY;
+		else {
+			/*
+			 * We reserve space for this task in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, p->dl.dl_bw);
+		}
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		rcu_read_unlock_sched();
+
+	}
+#endif
+out:
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /*
  * move_queued_task - move a queued task to new rq.
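A worked example of the shrink test in cpuset_cpumask_can_shrink() above, as standalone C; the 95% per-CPU cap and the amount of bandwidth already admitted are invented numbers, fixed point scaled by 1 << 20 as in the kernel's accounting:

/* Example only: cap and admitted bandwidth values below are made up. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_UNIT (1u << 20)

static bool can_shrink_example(int64_t per_cpu_cap, uint64_t total_admitted,
			       int trial_cpus)
{
	/* Same condition as the hunk: refuse if cap * cpus < already admitted. */
	if (per_cpu_cap != -1 &&
	    (uint64_t)per_cpu_cap * trial_cpus < total_admitted)
		return false;
	return true;
}

int main(void)
{
	int64_t cap = BW_UNIT * 95 / 100;	/* 95% of each CPU        */
	uint64_t admitted = BW_UNIT * 3 / 2;	/* 1.5 CPUs' worth in use */

	printf("shrink to 2 CPUs: %s\n",
	       can_shrink_example(cap, admitted, 2) ? "ok" : "refused");
	printf("shrink to 1 CPU:  %s\n",
	       can_shrink_example(cap, admitted, 1) ? "ok" : "refused");
	return 0;
}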
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
 static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 #endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
 	printk(KERN_WARNING "\n");
 }
 
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
 {
 	int i;
 
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
 	return false;
 }
 
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ *   is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ *   there is an intermediary node C, which is < N hops away from both
+ *   nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+	int a, b, c, n;
+
+	n = sched_max_numa_distance;
+
+	if (n <= 1)
+		sched_numa_topology_type = NUMA_DIRECT;
+
+	for_each_online_node(a) {
+		for_each_online_node(b) {
+			/* Find two nodes furthest removed from each other. */
+			if (node_distance(a, b) < n)
+				continue;
+
+			/* Is there an intermediary node between a and b? */
+			for_each_online_node(c) {
+				if (node_distance(a, c) < n &&
+				    node_distance(b, c) < n) {
+					sched_numa_topology_type =
+							NUMA_GLUELESS_MESH;
+					return;
+				}
+			}
+
+			sched_numa_topology_type = NUMA_BACKPLANE;
+			return;
+		}
+	}
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
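A standalone illustration of the classification logic above, run over two invented node_distance() tables (SLIT-style values, local distance 10). It mirrors the loop structure of init_numa_topology_type(); the direct-connection test here simply checks that only local distances exist, and none of this is kernel code:

/* Example only: the distance tables below are invented. */
#include <stdio.h>

#define NR_NODES 4

/* Glueless mesh: the farthest pairs (distance 30) always have a node that
 * is closer to both endpoints. */
static const int mesh[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

/* Backplane: all remote nodes are equally far apart, so no on-node
 * intermediary exists between the farthest pairs. */
static const int backplane[NR_NODES][NR_NODES] = {
	{ 10, 40, 40, 40 },
	{ 40, 10, 40, 40 },
	{ 40, 40, 10, 40 },
	{ 40, 40, 40, 10 },
};

static const char *classify(const int d[NR_NODES][NR_NODES])
{
	int a, b, c, max = 0;

	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++)
			if (d[a][b] > max)
				max = d[a][b];

	if (max == 10)
		return "NUMA_DIRECT";		/* only local distances */

	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			if (d[a][b] < max)
				continue;	/* not a farthest pair */
			for (c = 0; c < NR_NODES; c++)
				if (d[a][c] < max && d[b][c] < max)
					return "NUMA_GLUELESS_MESH";
			return "NUMA_BACKPLANE";
		}
	}
	return "NUMA_DIRECT";
}

int main(void)
{
	printf("mesh table:      %s\n", classify(mesh));
	printf("backplane table: %s\n", classify(backplane));
	return 0;
}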
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = level;
+	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+	init_numa_topology_type();
 }
 
 static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state,
+	 * since we will exit with TASK_RUNNING make sure we enter with it,
+	 * otherwise we will destroy state.
+	 */
+	if (WARN_ONCE(current->state != TASK_RUNNING,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change))
+		__set_current_state(TASK_RUNNING);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
 	static unsigned long prev_jiffy; /* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
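A kernel-style sketch of the bug class the new WARN_ONCE() in __might_sleep() is aimed at (not part of this patch; the lock, flag, and function names are invented, and the check fires when CONFIG_DEBUG_ATOMIC_SLEEP is enabled):

#include <linux/mutex.h>
#include <linux/sched.h>

static DEFINE_MUTEX(example_lock);	/* invented for illustration */
static bool example_condition;		/* invented for illustration */

static void example_wait(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (!example_condition) {
		/*
		 * mutex_lock() may sleep, so its might_sleep() check hits the
		 * new WARN_ONCE() and resets us to TASK_RUNNING; the
		 * schedule() below would then no longer block waiting for a
		 * wakeup, which is the silent bug the warning surfaces.
		 */
		mutex_lock(&example_lock);
		mutex_unlock(&example_lock);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}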
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 #endif
 	dump_stack();
 }
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ