Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 11
-rw-r--r-- | kernel/cpuset.c | 23
-rw-r--r-- | kernel/exit.c | 5
-rw-r--r-- | kernel/locking/mutex.c | 8
-rw-r--r-- | kernel/module.c | 30
-rw-r--r-- | kernel/sched/completion.c | 5
-rw-r--r-- | kernel/sched/core.c | 241
-rw-r--r-- | kernel/sched/cpudeadline.h | 3
-rw-r--r-- | kernel/sched/cpupri.h | 3
-rw-r--r-- | kernel/sched/deadline.c | 99
-rw-r--r-- | kernel/sched/debug.c | 11
-rw-r--r-- | kernel/sched/fair.c | 354
-rw-r--r-- | kernel/sched/rt.c | 17
-rw-r--r-- | kernel/sched/sched.h | 43
-rw-r--r-- | kernel/sched/wait.c | 66
-rw-r--r-- | kernel/smpboot.c | 15
16 files changed, 721 insertions, 213 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) | |||
499 | set_freezable(); | 499 | set_freezable(); |
500 | while (!kthread_should_stop()) { | 500 | while (!kthread_should_stop()) { |
501 | struct sk_buff *skb; | 501 | struct sk_buff *skb; |
502 | DECLARE_WAITQUEUE(wait, current); | ||
503 | 502 | ||
504 | flush_hold_queue(); | 503 | flush_hold_queue(); |
505 | 504 | ||
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) | |||
514 | audit_printk_skb(skb); | 513 | audit_printk_skb(skb); |
515 | continue; | 514 | continue; |
516 | } | 515 | } |
517 | set_current_state(TASK_INTERRUPTIBLE); | ||
518 | add_wait_queue(&kauditd_wait, &wait); | ||
519 | 516 | ||
520 | if (!skb_queue_len(&audit_skb_queue)) { | 517 | wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); |
521 | try_to_freeze(); | ||
522 | schedule(); | ||
523 | } | ||
524 | |||
525 | __set_current_state(TASK_RUNNING); | ||
526 | remove_wait_queue(&kauditd_wait, &wait); | ||
527 | } | 518 | } |
528 | return 0; | 519 | return 0; |
529 | } | 520 | } |
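For reference, the conversion above collapses an open-coded wait loop (DECLARE_WAITQUEUE + set_current_state + try_to_freeze + schedule + remove_wait_queue) into a single wait_event_freezable() call. A minimal sketch of the same idiom in a made-up kthread; my_wait, my_queue and my_thread are illustrative names, not from the patch:

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/skbuff.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wait);
static struct sk_buff_head my_queue;

static int my_thread(void *unused)
{
	set_freezable();
	while (!kthread_should_stop()) {
		/* ... drain my_queue here ... */

		/*
		 * Sleep until the queue is non-empty; the helper sets and
		 * clears the task state and handles the freezer handshake,
		 * which the old open-coded loop had to do by hand.
		 */
		wait_event_freezable(my_wait, skb_queue_len(&my_queue));
	}
	return 0;
}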
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) | |||
506 | goto out; | 506 | goto out; |
507 | } | 507 | } |
508 | 508 | ||
509 | /* | ||
510 | * We can't shrink if we won't have enough room for SCHED_DEADLINE | ||
511 | * tasks. | ||
512 | */ | ||
513 | ret = -EBUSY; | ||
514 | if (is_cpu_exclusive(cur) && | ||
515 | !cpuset_cpumask_can_shrink(cur->cpus_allowed, | ||
516 | trial->cpus_allowed)) | ||
517 | goto out; | ||
518 | |||
509 | ret = 0; | 519 | ret = 0; |
510 | out: | 520 | out: |
511 | rcu_read_unlock(); | 521 | rcu_read_unlock(); |
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
1429 | goto out_unlock; | 1439 | goto out_unlock; |
1430 | 1440 | ||
1431 | cgroup_taskset_for_each(task, tset) { | 1441 | cgroup_taskset_for_each(task, tset) { |
1432 | /* | 1442 | ret = task_can_attach(task, cs->cpus_allowed); |
1433 | * Kthreads which disallow setaffinity shouldn't be moved | 1443 | if (ret) |
1434 | * to a new cpuset; we don't want to change their cpu | ||
1435 | * affinity and isolating such threads by their set of | ||
1436 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
1437 | * applicable for such threads. This prevents checking for | ||
1438 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
1439 | * before cpus_allowed may be changed. | ||
1440 | */ | ||
1441 | ret = -EINVAL; | ||
1442 | if (task->flags & PF_NO_SETAFFINITY) | ||
1443 | goto out_unlock; | 1444 | goto out_unlock; |
1444 | ret = security_task_setscheduler(task); | 1445 | ret = security_task_setscheduler(task); |
1445 | if (ret) | 1446 | if (ret) |
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..232c4bc8bcc9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
997 | 997 | ||
998 | get_task_struct(p); | 998 | get_task_struct(p); |
999 | read_unlock(&tasklist_lock); | 999 | read_unlock(&tasklist_lock); |
1000 | sched_annotate_sleep(); | ||
1001 | |||
1000 | if ((exit_code & 0x7f) == 0) { | 1002 | if ((exit_code & 0x7f) == 0) { |
1001 | why = CLD_EXITED; | 1003 | why = CLD_EXITED; |
1002 | status = exit_code >> 8; | 1004 | status = exit_code >> 8; |
@@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1079 | * thread can reap it because we set its state == DEAD/TRACE. | 1081 | * thread can reap it because we set its state == DEAD/TRACE. |
1080 | */ | 1082 | */ |
1081 | read_unlock(&tasklist_lock); | 1083 | read_unlock(&tasklist_lock); |
1084 | sched_annotate_sleep(); | ||
1082 | 1085 | ||
1083 | retval = wo->wo_rusage | 1086 | retval = wo->wo_rusage |
1084 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1087 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
@@ -1210,6 +1213,7 @@ unlock_sig: | |||
1210 | pid = task_pid_vnr(p); | 1213 | pid = task_pid_vnr(p); |
1211 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1214 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
1212 | read_unlock(&tasklist_lock); | 1215 | read_unlock(&tasklist_lock); |
1216 | sched_annotate_sleep(); | ||
1213 | 1217 | ||
1214 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1218 | if (unlikely(wo->wo_flags & WNOWAIT)) |
1215 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1219 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); |
@@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1272 | pid = task_pid_vnr(p); | 1276 | pid = task_pid_vnr(p); |
1273 | get_task_struct(p); | 1277 | get_task_struct(p); |
1274 | read_unlock(&tasklist_lock); | 1278 | read_unlock(&tasklist_lock); |
1279 | sched_annotate_sleep(); | ||
1275 | 1280 | ||
1276 | if (!wo->wo_info) { | 1281 | if (!wo->wo_info) { |
1277 | retval = wo->wo_rusage | 1282 | retval = wo->wo_rusage |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done: | |||
378 | * reschedule now, before we try-lock the mutex. This avoids getting | 378 | * reschedule now, before we try-lock the mutex. This avoids getting |
379 | * scheduled out right after we obtained the mutex. | 379 | * scheduled out right after we obtained the mutex. |
380 | */ | 380 | */ |
381 | if (need_resched()) | 381 | if (need_resched()) { |
382 | /* | ||
383 | * We _should_ have TASK_RUNNING here, but just in case | ||
384 | * we do not, make it so, otherwise we might get stuck. | ||
385 | */ | ||
386 | __set_current_state(TASK_RUNNING); | ||
382 | schedule_preempt_disabled(); | 387 | schedule_preempt_disabled(); |
388 | } | ||
383 | 389 | ||
384 | return false; | 390 | return false; |
385 | } | 391 | } |
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void) | |||
3097 | } | 3097 | } |
3098 | 3098 | ||
3099 | /* | 3099 | /* |
3100 | * Can't use wait_event_interruptible() because our condition | ||
3101 | * 'finished_loading()' contains a blocking primitive itself (mutex_lock). | ||
3102 | */ | ||
3103 | static int wait_finished_loading(struct module *mod) | ||
3104 | { | ||
3105 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | ||
3106 | int ret = 0; | ||
3107 | |||
3108 | add_wait_queue(&module_wq, &wait); | ||
3109 | for (;;) { | ||
3110 | if (finished_loading(mod->name)) | ||
3111 | break; | ||
3112 | |||
3113 | if (signal_pending(current)) { | ||
3114 | ret = -ERESTARTSYS; | ||
3115 | break; | ||
3116 | } | ||
3117 | |||
3118 | wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
3119 | } | ||
3120 | remove_wait_queue(&module_wq, &wait); | ||
3121 | |||
3122 | return ret; | ||
3123 | } | ||
3124 | |||
3125 | /* | ||
3100 | * We try to place it in the list now to make sure it's unique before | 3126 | * We try to place it in the list now to make sure it's unique before |
3101 | * we dedicate too many resources. In particular, temporary percpu | 3127 | * we dedicate too many resources. In particular, temporary percpu |
3102 | * memory exhaustion. | 3128 | * memory exhaustion. |
@@ -3116,8 +3142,8 @@ again: | |||
3116 | || old->state == MODULE_STATE_UNFORMED) { | 3142 | || old->state == MODULE_STATE_UNFORMED) { |
3117 | /* Wait in case it fails to load. */ | 3143 | /* Wait in case it fails to load. */ |
3118 | mutex_unlock(&module_mutex); | 3144 | mutex_unlock(&module_mutex); |
3119 | err = wait_event_interruptible(module_wq, | 3145 | |
3120 | finished_loading(mod->name)); | 3146 | err = wait_finished_loading(mod); |
3121 | if (err) | 3147 | if (err) |
3122 | goto out_unlocked; | 3148 | goto out_unlocked; |
3123 | goto again; | 3149 | goto again; |
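The wait_finished_loading() helper added above is an instance of the wait_woken() idiom: the condition itself sleeps (finished_loading() takes module_mutex), so it cannot be evaluated inside wait_event_interruptible(), and the waiter instead re-checks it in normal TASK_RUNNING context. A hedged, generic sketch of that pattern; my_wq, my_lock and module_ready are illustrative names, not kernel symbols:

#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static DEFINE_MUTEX(my_lock);
static bool module_ready;	/* the waker sets this, then calls wake_up(&my_wq) */

/* The condition blocks, which is exactly why wait_event*() cannot wrap it. */
static bool blocking_condition(void)
{
	bool ret;

	mutex_lock(&my_lock);
	ret = module_ready;
	mutex_unlock(&my_lock);
	return ret;
}

static int wait_for_blocking_condition(void)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int ret = 0;

	add_wait_queue(&my_wq, &wait);
	for (;;) {
		if (blocking_condition())	/* checked while TASK_RUNNING */
			break;
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* Sleeps until woken_wake_function() marks this entry woken. */
		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&my_wq, &wait);

	return ret;
}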
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); | |||
148 | * | 148 | * |
149 | * This waits to be signaled for completion of a specific task. It is NOT | 149 | * This waits to be signaled for completion of a specific task. It is NOT |
150 | * interruptible and there is no timeout. The caller is accounted as waiting | 150 | * interruptible and there is no timeout. The caller is accounted as waiting |
151 | * for IO. | 151 | * for IO (which traditionally means blkio only). |
152 | */ | 152 | */ |
153 | void __sched wait_for_completion_io(struct completion *x) | 153 | void __sched wait_for_completion_io(struct completion *x) |
154 | { | 154 | { |
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); | |||
163 | * | 163 | * |
164 | * This waits for either a completion of a specific task to be signaled or for a | 164 | * This waits for either a completion of a specific task to be signaled or for a |
165 | * specified timeout to expire. The timeout is in jiffies. It is not | 165 | * specified timeout to expire. The timeout is in jiffies. It is not |
166 | * interruptible. The caller is accounted as waiting for IO. | 166 | * interruptible. The caller is accounted as waiting for IO (which traditionally |
167 | * means blkio only). | ||
167 | * | 168 | * |
168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | 169 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
169 | * till timeout) if completed. | 170 | * till timeout) if completed. |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) | |||
1008 | return cpu_curr(task_cpu(p)) == p; | 1008 | return cpu_curr(task_cpu(p)) == p; |
1009 | } | 1009 | } |
1010 | 1010 | ||
1011 | /* | ||
1012 | * rq->lock may be dropped here, because sched_class::switched_from() methods may drop it. | ||
1013 | */ | ||
1011 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 1014 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
1012 | const struct sched_class *prev_class, | 1015 | const struct sched_class *prev_class, |
1013 | int oldprio) | 1016 | int oldprio) |
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1015 | if (prev_class != p->sched_class) { | 1018 | if (prev_class != p->sched_class) { |
1016 | if (prev_class->switched_from) | 1019 | if (prev_class->switched_from) |
1017 | prev_class->switched_from(rq, p); | 1020 | prev_class->switched_from(rq, p); |
1021 | /* Possible rq->lock 'hole'. */ | ||
1018 | p->sched_class->switched_to(rq, p); | 1022 | p->sched_class->switched_to(rq, p); |
1019 | } else if (oldprio != p->prio || dl_task(p)) | 1023 | } else if (oldprio != p->prio || dl_task(p)) |
1020 | p->sched_class->prio_changed(rq, p, oldprio); | 1024 | p->sched_class->prio_changed(rq, p, oldprio); |
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1054 | * ttwu() will sort out the placement. | 1058 | * ttwu() will sort out the placement. |
1055 | */ | 1059 | */ |
1056 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 1060 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
1057 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); | 1061 | !p->on_rq); |
1058 | 1062 | ||
1059 | #ifdef CONFIG_LOCKDEP | 1063 | #ifdef CONFIG_LOCKDEP |
1060 | /* | 1064 | /* |
@@ -1407,7 +1411,8 @@ out: | |||
1407 | static inline | 1411 | static inline |
1408 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | 1412 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
1409 | { | 1413 | { |
1410 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | 1414 | if (p->nr_cpus_allowed > 1) |
1415 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | ||
1411 | 1416 | ||
1412 | /* | 1417 | /* |
1413 | * In order not to call set_task_cpu() on a blocking task we need | 1418 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu) | |||
1623 | struct rq *rq = cpu_rq(cpu); | 1628 | struct rq *rq = cpu_rq(cpu); |
1624 | unsigned long flags; | 1629 | unsigned long flags; |
1625 | 1630 | ||
1626 | if (!is_idle_task(rq->curr)) | 1631 | rcu_read_lock(); |
1627 | return; | 1632 | |
1633 | if (!is_idle_task(rcu_dereference(rq->curr))) | ||
1634 | goto out; | ||
1628 | 1635 | ||
1629 | if (set_nr_if_polling(rq->idle)) { | 1636 | if (set_nr_if_polling(rq->idle)) { |
1630 | trace_sched_wake_idle_without_ipi(cpu); | 1637 | trace_sched_wake_idle_without_ipi(cpu); |
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu) | |||
1635 | /* Else cpu is not in idle, do nothing here */ | 1642 | /* Else cpu is not in idle, do nothing here */ |
1636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1643 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1637 | } | 1644 | } |
1645 | |||
1646 | out: | ||
1647 | rcu_read_unlock(); | ||
1638 | } | 1648 | } |
1639 | 1649 | ||
1640 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1650 | bool cpus_share_cache(int this_cpu, int that_cpu) |
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1853 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1863 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
1854 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1864 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
1855 | p->numa_work.next = &p->numa_work; | 1865 | p->numa_work.next = &p->numa_work; |
1856 | p->numa_faults_memory = NULL; | 1866 | p->numa_faults = NULL; |
1857 | p->numa_faults_buffer_memory = NULL; | ||
1858 | p->last_task_numa_placement = 0; | 1867 | p->last_task_numa_placement = 0; |
1859 | p->last_sum_exec_runtime = 0; | 1868 | p->last_sum_exec_runtime = 0; |
1860 | 1869 | ||
1861 | INIT_LIST_HEAD(&p->numa_entry); | ||
1862 | p->numa_group = NULL; | 1870 | p->numa_group = NULL; |
1863 | #endif /* CONFIG_NUMA_BALANCING */ | 1871 | #endif /* CONFIG_NUMA_BALANCING */ |
1864 | } | 1872 | } |
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i) | |||
2034 | } | 2042 | } |
2035 | #endif | 2043 | #endif |
2036 | 2044 | ||
2037 | static inline | ||
2038 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
2039 | { | ||
2040 | dl_b->total_bw -= tsk_bw; | ||
2041 | } | ||
2042 | |||
2043 | static inline | ||
2044 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
2045 | { | ||
2046 | dl_b->total_bw += tsk_bw; | ||
2047 | } | ||
2048 | |||
2049 | static inline | ||
2050 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
2051 | { | ||
2052 | return dl_b->bw != -1 && | ||
2053 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
2054 | } | ||
2055 | |||
2056 | /* | 2045 | /* |
2057 | * We must be sure that accepting a new task (or allowing changing the | 2046 | * We must be sure that accepting a new task (or allowing changing the |
2058 | * parameters of an existing one) is consistent with the bandwidth | 2047 | * parameters of an existing one) is consistent with the bandwidth |
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
2220 | 2209 | ||
2221 | /** | 2210 | /** |
2222 | * finish_task_switch - clean up after a task-switch | 2211 | * finish_task_switch - clean up after a task-switch |
2223 | * @rq: runqueue associated with task-switch | ||
2224 | * @prev: the thread we just switched away from. | 2212 | * @prev: the thread we just switched away from. |
2225 | * | 2213 | * |
2226 | * finish_task_switch must be called after the context switch, paired | 2214 | * finish_task_switch must be called after the context switch, paired |
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
2232 | * so, we finish that here outside of the runqueue lock. (Doing it | 2220 | * so, we finish that here outside of the runqueue lock. (Doing it |
2233 | * with the lock held can cause deadlocks; see schedule() for | 2221 | * with the lock held can cause deadlocks; see schedule() for |
2234 | * details.) | 2222 | * details.) |
2223 | * | ||
2224 | * The context switch have flipped the stack from under us and restored the | ||
2225 | * local variables which were saved when this task called schedule() in the | ||
2226 | * past. prev == current is still correct but we need to recalculate this_rq | ||
2227 | * because prev may have moved to another CPU. | ||
2235 | */ | 2228 | */ |
2236 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) | 2229 | static struct rq *finish_task_switch(struct task_struct *prev) |
2237 | __releases(rq->lock) | 2230 | __releases(rq->lock) |
2238 | { | 2231 | { |
2232 | struct rq *rq = this_rq(); | ||
2239 | struct mm_struct *mm = rq->prev_mm; | 2233 | struct mm_struct *mm = rq->prev_mm; |
2240 | long prev_state; | 2234 | long prev_state; |
2241 | 2235 | ||
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2275 | } | 2269 | } |
2276 | 2270 | ||
2277 | tick_nohz_task_switch(current); | 2271 | tick_nohz_task_switch(current); |
2272 | return rq; | ||
2278 | } | 2273 | } |
2279 | 2274 | ||
2280 | #ifdef CONFIG_SMP | 2275 | #ifdef CONFIG_SMP |
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq) | |||
2309 | asmlinkage __visible void schedule_tail(struct task_struct *prev) | 2304 | asmlinkage __visible void schedule_tail(struct task_struct *prev) |
2310 | __releases(rq->lock) | 2305 | __releases(rq->lock) |
2311 | { | 2306 | { |
2312 | struct rq *rq = this_rq(); | 2307 | struct rq *rq; |
2313 | |||
2314 | finish_task_switch(rq, prev); | ||
2315 | 2308 | ||
2316 | /* | 2309 | /* finish_task_switch() drops rq->lock and enables preemption */ |
2317 | * FIXME: do we need to worry about rq being invalidated by the | 2310 | preempt_disable(); |
2318 | * task_switch? | 2311 | rq = finish_task_switch(prev); |
2319 | */ | ||
2320 | post_schedule(rq); | 2312 | post_schedule(rq); |
2313 | preempt_enable(); | ||
2321 | 2314 | ||
2322 | if (current->set_child_tid) | 2315 | if (current->set_child_tid) |
2323 | put_user(task_pid_vnr(current), current->set_child_tid); | 2316 | put_user(task_pid_vnr(current), current->set_child_tid); |
2324 | } | 2317 | } |
2325 | 2318 | ||
2326 | /* | 2319 | /* |
2327 | * context_switch - switch to the new MM and the new | 2320 | * context_switch - switch to the new MM and the new thread's register state. |
2328 | * thread's register state. | ||
2329 | */ | 2321 | */ |
2330 | static inline void | 2322 | static inline struct rq * |
2331 | context_switch(struct rq *rq, struct task_struct *prev, | 2323 | context_switch(struct rq *rq, struct task_struct *prev, |
2332 | struct task_struct *next) | 2324 | struct task_struct *next) |
2333 | { | 2325 | { |
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2366 | context_tracking_task_switch(prev, next); | 2358 | context_tracking_task_switch(prev, next); |
2367 | /* Here we just switch the register state and the stack. */ | 2359 | /* Here we just switch the register state and the stack. */ |
2368 | switch_to(prev, next, prev); | 2360 | switch_to(prev, next, prev); |
2369 | |||
2370 | barrier(); | 2361 | barrier(); |
2371 | /* | 2362 | |
2372 | * this_rq must be evaluated again because prev may have moved | 2363 | return finish_task_switch(prev); |
2373 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
2374 | * frame will be invalid. | ||
2375 | */ | ||
2376 | finish_task_switch(this_rq(), prev); | ||
2377 | } | 2364 | } |
2378 | 2365 | ||
2379 | /* | 2366 | /* |
@@ -2826,15 +2813,8 @@ need_resched: | |||
2826 | rq->curr = next; | 2813 | rq->curr = next; |
2827 | ++*switch_count; | 2814 | ++*switch_count; |
2828 | 2815 | ||
2829 | context_switch(rq, prev, next); /* unlocks the rq */ | 2816 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
2830 | /* | 2817 | cpu = cpu_of(rq); |
2831 | * The context switch have flipped the stack from under us | ||
2832 | * and restored the local variables which were saved when | ||
2833 | * this task called schedule() in the past. prev == current | ||
2834 | * is still correct, but it can be moved to another cpu/rq. | ||
2835 | */ | ||
2836 | cpu = smp_processor_id(); | ||
2837 | rq = cpu_rq(cpu); | ||
2838 | } else | 2818 | } else |
2839 | raw_spin_unlock_irq(&rq->lock); | 2819 | raw_spin_unlock_irq(&rq->lock); |
2840 | 2820 | ||
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4653 | #endif | 4633 | #endif |
4654 | } | 4634 | } |
4655 | 4635 | ||
4636 | int cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
4637 | const struct cpumask *trial) | ||
4638 | { | ||
4639 | int ret = 1, trial_cpus; | ||
4640 | struct dl_bw *cur_dl_b; | ||
4641 | unsigned long flags; | ||
4642 | |||
4643 | rcu_read_lock_sched(); | ||
4644 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | ||
4645 | trial_cpus = cpumask_weight(trial); | ||
4646 | |||
4647 | raw_spin_lock_irqsave(&cur_dl_b->lock, flags); | ||
4648 | if (cur_dl_b->bw != -1 && | ||
4649 | cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) | ||
4650 | ret = 0; | ||
4651 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | ||
4652 | rcu_read_unlock_sched(); | ||
4653 | |||
4654 | return ret; | ||
4655 | } | ||
4656 | |||
4657 | int task_can_attach(struct task_struct *p, | ||
4658 | const struct cpumask *cs_cpus_allowed) | ||
4659 | { | ||
4660 | int ret = 0; | ||
4661 | |||
4662 | /* | ||
4663 | * Kthreads which disallow setaffinity shouldn't be moved | ||
4664 | * to a new cpuset; we don't want to change their cpu | ||
4665 | * affinity and isolating such threads by their set of | ||
4666 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
4667 | * applicable for such threads. This prevents checking for | ||
4668 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
4669 | * before cpus_allowed may be changed. | ||
4670 | */ | ||
4671 | if (p->flags & PF_NO_SETAFFINITY) { | ||
4672 | ret = -EINVAL; | ||
4673 | goto out; | ||
4674 | } | ||
4675 | |||
4676 | #ifdef CONFIG_SMP | ||
4677 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, | ||
4678 | cs_cpus_allowed)) { | ||
4679 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | ||
4680 | cs_cpus_allowed); | ||
4681 | struct dl_bw *dl_b; | ||
4682 | bool overflow; | ||
4683 | int cpus; | ||
4684 | unsigned long flags; | ||
4685 | |||
4686 | rcu_read_lock_sched(); | ||
4687 | dl_b = dl_bw_of(dest_cpu); | ||
4688 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
4689 | cpus = dl_bw_cpus(dest_cpu); | ||
4690 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
4691 | if (overflow) | ||
4692 | ret = -EBUSY; | ||
4693 | else { | ||
4694 | /* | ||
4695 | * We reserve space for this task in the destination | ||
4696 | * root_domain, as we can't fail after this point. | ||
4697 | * We will free resources in the source root_domain | ||
4698 | * later on (see set_cpus_allowed_dl()). | ||
4699 | */ | ||
4700 | __dl_add(dl_b, p->dl.dl_bw); | ||
4701 | } | ||
4702 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
4703 | rcu_read_unlock_sched(); | ||
4704 | |||
4705 | } | ||
4706 | #endif | ||
4707 | out: | ||
4708 | return ret; | ||
4709 | } | ||
4710 | |||
4656 | #ifdef CONFIG_SMP | 4711 | #ifdef CONFIG_SMP |
4657 | /* | 4712 | /* |
4658 | * move_queued_task - move a queued task to new rq. | 4713 | * move_queued_task - move a queued task to new rq. |
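Both cpuset_cpumask_can_shrink() and the SCHED_DEADLINE branch of task_can_attach() above reduce to the same admission inequality used by __dl_overflow(): the per-CPU bandwidth cap times the number of CPUs in the domain must not fall below the summed bandwidth of admitted deadline tasks. A small user-space model of that check with invented numbers, in the same fixed-point style (fractions of 1 << 20); it is an illustration, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the shape of the kernel's __dl_overflow() test. */
static bool dl_overflow(long long cap_per_cpu, int cpus,
			long long old_bw, long long new_bw, long long total_bw)
{
	return cap_per_cpu != -1 &&
	       cap_per_cpu * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	long long cap = (95LL << 20) / 100;	/* 95% runtime cap per CPU */
	long long task_bw = (60LL << 20) / 100;	/* each task reserves 60% */
	long long total = 2 * task_bw;		/* two tasks already admitted */

	/* Shrinking an exclusive cpuset from 2 CPUs to 1 would overflow: */
	printf("2 cpus: %s\n", dl_overflow(cap, 2, 0, 0, total) ? "overflow" : "fits");
	printf("1 cpu : %s\n", dl_overflow(cap, 1, 0, 0, total) ? "overflow" : "fits");
	return 0;
}

This is why validate_change() in the cpuset hunk now returns -EBUSY before the cpus_allowed mask of an exclusive cpuset is allowed to shrink.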
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
6103 | 6158 | ||
6104 | #ifdef CONFIG_NUMA | 6159 | #ifdef CONFIG_NUMA |
6105 | static int sched_domains_numa_levels; | 6160 | static int sched_domains_numa_levels; |
6161 | enum numa_topology_type sched_numa_topology_type; | ||
6106 | static int *sched_domains_numa_distance; | 6162 | static int *sched_domains_numa_distance; |
6163 | int sched_max_numa_distance; | ||
6107 | static struct cpumask ***sched_domains_numa_masks; | 6164 | static struct cpumask ***sched_domains_numa_masks; |
6108 | static int sched_domains_curr_level; | 6165 | static int sched_domains_curr_level; |
6109 | #endif | 6166 | #endif |
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str) | |||
6275 | printk(KERN_WARNING "\n"); | 6332 | printk(KERN_WARNING "\n"); |
6276 | } | 6333 | } |
6277 | 6334 | ||
6278 | static bool find_numa_distance(int distance) | 6335 | bool find_numa_distance(int distance) |
6279 | { | 6336 | { |
6280 | int i; | 6337 | int i; |
6281 | 6338 | ||
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance) | |||
6290 | return false; | 6347 | return false; |
6291 | } | 6348 | } |
6292 | 6349 | ||
6350 | /* | ||
6351 | * A system can have three types of NUMA topology: | ||
6352 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
6353 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
6354 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
6355 | * | ||
6356 | * The difference between a glueless mesh topology and a backplane | ||
6357 | * topology lies in whether communication between not directly | ||
6358 | * connected nodes goes through intermediary nodes (where programs | ||
6359 | * could run), or through backplane controllers. This affects | ||
6360 | * placement of programs. | ||
6361 | * | ||
6362 | * The type of topology can be discerned with the following tests: | ||
6363 | * - If the maximum distance between any nodes is 1 hop, the system | ||
6364 | * is directly connected. | ||
6365 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
6366 | * there is an intermediary node C, which is < N hops away from both | ||
6367 | * nodes A and B, the system is a glueless mesh. | ||
6368 | */ | ||
6369 | static void init_numa_topology_type(void) | ||
6370 | { | ||
6371 | int a, b, c, n; | ||
6372 | |||
6373 | n = sched_max_numa_distance; | ||
6374 | |||
6375 | if (n <= 1) | ||
6376 | sched_numa_topology_type = NUMA_DIRECT; | ||
6377 | |||
6378 | for_each_online_node(a) { | ||
6379 | for_each_online_node(b) { | ||
6380 | /* Find two nodes furthest removed from each other. */ | ||
6381 | if (node_distance(a, b) < n) | ||
6382 | continue; | ||
6383 | |||
6384 | /* Is there an intermediary node between a and b? */ | ||
6385 | for_each_online_node(c) { | ||
6386 | if (node_distance(a, c) < n && | ||
6387 | node_distance(b, c) < n) { | ||
6388 | sched_numa_topology_type = | ||
6389 | NUMA_GLUELESS_MESH; | ||
6390 | return; | ||
6391 | } | ||
6392 | } | ||
6393 | |||
6394 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
6395 | return; | ||
6396 | } | ||
6397 | } | ||
6398 | } | ||
6399 | |||
6293 | static void sched_init_numa(void) | 6400 | static void sched_init_numa(void) |
6294 | { | 6401 | { |
6295 | int next_distance, curr_distance = node_distance(0, 0); | 6402 | int next_distance, curr_distance = node_distance(0, 0); |
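init_numa_topology_type() above derives the topology class purely from the node_distance() table. A stand-alone model with an invented 4-node distance matrix shows how the three classes fall out; the threshold used for the direct-connection test here is a simplification of the kernel's check:

#include <stdio.h>

enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE };

#define NR_NODES 4

/* Invented table: nodes 0-1 and 2-3 are close pairs, the pairs are far apart. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

static enum numa_topology_type classify(int max_dist)
{
	int a, b, c;

	if (max_dist <= 20)	/* nothing beyond an ordinary one-hop remote distance */
		return NUMA_DIRECT;

	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			if (dist[a][b] < max_dist)
				continue;
			/* a and b are maximally distant; is a closer relay c available? */
			for (c = 0; c < NR_NODES; c++)
				if (dist[a][c] < max_dist && dist[b][c] < max_dist)
					return NUMA_GLUELESS_MESH;
			return NUMA_BACKPLANE;
		}
	}
	return NUMA_DIRECT;
}

int main(void)
{
	/* No node relays between the two far pairs, so this prints 2 (backplane). */
	printf("topology type = %d\n", classify(40));
	return 0;
}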
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void) | |||
6426 | sched_domain_topology = tl; | 6533 | sched_domain_topology = tl; |
6427 | 6534 | ||
6428 | sched_domains_numa_levels = level; | 6535 | sched_domains_numa_levels = level; |
6536 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
6537 | |||
6538 | init_numa_topology_type(); | ||
6429 | } | 6539 | } |
6430 | 6540 | ||
6431 | static void sched_domains_numa_masks_set(int cpu) | 6541 | static void sched_domains_numa_masks_set(int cpu) |
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset) | |||
7178 | 7288 | ||
7179 | void __might_sleep(const char *file, int line, int preempt_offset) | 7289 | void __might_sleep(const char *file, int line, int preempt_offset) |
7180 | { | 7290 | { |
7291 | /* | ||
7292 | * Blocking primitives will set (and therefore destroy) current->state, | ||
7293 | * since we will exit with TASK_RUNNING, make sure we enter with it, | ||
7294 | * otherwise we will destroy state. | ||
7295 | */ | ||
7296 | if (WARN_ONCE(current->state != TASK_RUNNING, | ||
7297 | "do not call blocking ops when !TASK_RUNNING; " | ||
7298 | "state=%lx set at [<%p>] %pS\n", | ||
7299 | current->state, | ||
7300 | (void *)current->task_state_change, | ||
7301 | (void *)current->task_state_change)) | ||
7302 | __set_current_state(TASK_RUNNING); | ||
7303 | |||
7304 | ___might_sleep(file, line, preempt_offset); | ||
7305 | } | ||
7306 | EXPORT_SYMBOL(__might_sleep); | ||
7307 | |||
7308 | void ___might_sleep(const char *file, int line, int preempt_offset) | ||
7309 | { | ||
7181 | static unsigned long prev_jiffy; /* ratelimiting */ | 7310 | static unsigned long prev_jiffy; /* ratelimiting */ |
7182 | 7311 | ||
7183 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7312 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
7209 | #endif | 7338 | #endif |
7210 | dump_stack(); | 7339 | dump_stack(); |
7211 | } | 7340 | } |
7212 | EXPORT_SYMBOL(__might_sleep); | 7341 | EXPORT_SYMBOL(___might_sleep); |
7213 | #endif | 7342 | #endif |
7214 | 7343 | ||
7215 | #ifdef CONFIG_MAGIC_SYSRQ | 7344 | #ifdef CONFIG_MAGIC_SYSRQ |
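The WARN_ONCE() added to __might_sleep() above catches callers that enter a blocking primitive after having already set a non-running task state: the inner primitive returns with TASK_RUNNING and silently destroys the sleep state the caller prepared (the mutex.c hunk earlier guards the converse case in the optimistic-spin path). A hedged sketch of the bug class it flags; the wait code is illustrative and not taken from the patch:

#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);
static DEFINE_MUTEX(lock);
static bool done;

static void buggy_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
	/*
	 * BUG: current->state is TASK_INTERRUPTIBLE here, but mutex_lock()
	 * is itself a blocking primitive; it may schedule and will return
	 * with the state reset to TASK_RUNNING, wiping out the sleep set up
	 * by prepare_to_wait().  The new __might_sleep() check warns here
	 * and forces TASK_RUNNING so the caller cannot get wedged.
	 */
	mutex_lock(&lock);
	if (!done)
		schedule();
	mutex_unlock(&lock);
	finish_wait(&wq, &wait);
}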
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); |
26 | int cpudl_init(struct cpudl *cp); | 26 | int cpudl_init(struct cpudl *cp); |
27 | void cpudl_cleanup(struct cpudl *cp); | 27 | void cpudl_cleanup(struct cpudl *cp); |
28 | #else | ||
29 | #define cpudl_set(cp, cpu, dl) do { } while (0) | ||
30 | #define cpudl_init() do { } while (0) | ||
31 | #endif /* CONFIG_SMP */ | 28 | #endif /* CONFIG_SMP */ |
32 | 29 | ||
33 | #endif /* _LINUX_CPUDL_H */ | 30 | #endif /* _LINUX_CPUDL_H */ |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, | |||
26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
27 | int cpupri_init(struct cpupri *cp); | 27 | int cpupri_init(struct cpupri *cp); |
28 | void cpupri_cleanup(struct cpupri *cp); | 28 | void cpupri_cleanup(struct cpupri *cp); |
29 | #else | ||
30 | #define cpupri_set(cp, cpu, pri) do { } while (0) | ||
31 | #define cpupri_init() do { } while (0) | ||
32 | #endif | 29 | #endif |
33 | 30 | ||
34 | #endif /* _LINUX_CPUPRI_H */ | 31 | #endif /* _LINUX_CPUPRI_H */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 28fa9d9e9201..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
563 | { | 563 | { |
564 | struct hrtimer *timer = &dl_se->dl_timer; | 564 | struct hrtimer *timer = &dl_se->dl_timer; |
565 | 565 | ||
566 | if (hrtimer_active(timer)) { | ||
567 | hrtimer_try_to_cancel(timer); | ||
568 | return; | ||
569 | } | ||
570 | |||
571 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 566 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
572 | timer->function = dl_task_timer; | 567 | timer->function = dl_task_timer; |
573 | } | 568 | } |
@@ -633,7 +628,7 @@ static void update_curr_dl(struct rq *rq) | |||
633 | 628 | ||
634 | sched_rt_avg_update(rq, delta_exec); | 629 | sched_rt_avg_update(rq, delta_exec); |
635 | 630 | ||
636 | dl_se->runtime -= delta_exec; | 631 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; |
637 | if (dl_runtime_exceeded(rq, dl_se)) { | 632 | if (dl_runtime_exceeded(rq, dl_se)) { |
638 | __dequeue_task_dl(rq, curr, 0); | 633 | __dequeue_task_dl(rq, curr, 0); |
639 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | 634 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) |
@@ -933,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
933 | struct task_struct *curr; | 928 | struct task_struct *curr; |
934 | struct rq *rq; | 929 | struct rq *rq; |
935 | 930 | ||
936 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 931 | if (sd_flag != SD_BALANCE_WAKE) |
937 | goto out; | 932 | goto out; |
938 | 933 | ||
939 | rq = cpu_rq(cpu); | 934 | rq = cpu_rq(cpu); |
@@ -1018,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | |||
1018 | { | 1013 | { |
1019 | hrtick_start(rq, p->dl.runtime); | 1014 | hrtick_start(rq, p->dl.runtime); |
1020 | } | 1015 | } |
1016 | #else /* !CONFIG_SCHED_HRTICK */ | ||
1017 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
1018 | { | ||
1019 | } | ||
1021 | #endif | 1020 | #endif |
1022 | 1021 | ||
1023 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | 1022 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, |
@@ -1071,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
1071 | /* Running task will never be pushed. */ | 1070 | /* Running task will never be pushed. */ |
1072 | dequeue_pushable_dl_task(rq, p); | 1071 | dequeue_pushable_dl_task(rq, p); |
1073 | 1072 | ||
1074 | #ifdef CONFIG_SCHED_HRTICK | ||
1075 | if (hrtick_enabled(rq)) | 1073 | if (hrtick_enabled(rq)) |
1076 | start_hrtick_dl(rq, p); | 1074 | start_hrtick_dl(rq, p); |
1077 | #endif | ||
1078 | 1075 | ||
1079 | set_post_schedule(rq); | 1076 | set_post_schedule(rq); |
1080 | 1077 | ||
@@ -1093,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
1093 | { | 1090 | { |
1094 | update_curr_dl(rq); | 1091 | update_curr_dl(rq); |
1095 | 1092 | ||
1096 | #ifdef CONFIG_SCHED_HRTICK | ||
1097 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | 1093 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) |
1098 | start_hrtick_dl(rq, p); | 1094 | start_hrtick_dl(rq, p); |
1099 | #endif | ||
1100 | } | 1095 | } |
1101 | 1096 | ||
1102 | static void task_fork_dl(struct task_struct *p) | 1097 | static void task_fork_dl(struct task_struct *p) |
@@ -1333,6 +1328,7 @@ static int push_dl_task(struct rq *rq) | |||
1333 | { | 1328 | { |
1334 | struct task_struct *next_task; | 1329 | struct task_struct *next_task; |
1335 | struct rq *later_rq; | 1330 | struct rq *later_rq; |
1331 | int ret = 0; | ||
1336 | 1332 | ||
1337 | if (!rq->dl.overloaded) | 1333 | if (!rq->dl.overloaded) |
1338 | return 0; | 1334 | return 0; |
@@ -1378,7 +1374,6 @@ retry: | |||
1378 | * The task is still there. We don't try | 1374 | * The task is still there. We don't try |
1379 | * again, some other cpu will pull it when ready. | 1375 | * again, some other cpu will pull it when ready. |
1380 | */ | 1376 | */ |
1381 | dequeue_pushable_dl_task(rq, next_task); | ||
1382 | goto out; | 1377 | goto out; |
1383 | } | 1378 | } |
1384 | 1379 | ||
@@ -1394,6 +1389,7 @@ retry: | |||
1394 | deactivate_task(rq, next_task, 0); | 1389 | deactivate_task(rq, next_task, 0); |
1395 | set_task_cpu(next_task, later_rq->cpu); | 1390 | set_task_cpu(next_task, later_rq->cpu); |
1396 | activate_task(later_rq, next_task, 0); | 1391 | activate_task(later_rq, next_task, 0); |
1392 | ret = 1; | ||
1397 | 1393 | ||
1398 | resched_curr(later_rq); | 1394 | resched_curr(later_rq); |
1399 | 1395 | ||
@@ -1402,7 +1398,7 @@ retry: | |||
1402 | out: | 1398 | out: |
1403 | put_task_struct(next_task); | 1399 | put_task_struct(next_task); |
1404 | 1400 | ||
1405 | return 1; | 1401 | return ret; |
1406 | } | 1402 | } |
1407 | 1403 | ||
1408 | static void push_dl_tasks(struct rq *rq) | 1404 | static void push_dl_tasks(struct rq *rq) |
@@ -1508,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | |||
1508 | p->nr_cpus_allowed > 1 && | 1504 | p->nr_cpus_allowed > 1 && |
1509 | dl_task(rq->curr) && | 1505 | dl_task(rq->curr) && |
1510 | (rq->curr->nr_cpus_allowed < 2 || | 1506 | (rq->curr->nr_cpus_allowed < 2 || |
1511 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | 1507 | !dl_entity_preempt(&p->dl, &rq->curr->dl))) { |
1512 | push_dl_tasks(rq); | 1508 | push_dl_tasks(rq); |
1513 | } | 1509 | } |
1514 | } | 1510 | } |
@@ -1517,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
1517 | const struct cpumask *new_mask) | 1513 | const struct cpumask *new_mask) |
1518 | { | 1514 | { |
1519 | struct rq *rq; | 1515 | struct rq *rq; |
1516 | struct root_domain *src_rd; | ||
1520 | int weight; | 1517 | int weight; |
1521 | 1518 | ||
1522 | BUG_ON(!dl_task(p)); | 1519 | BUG_ON(!dl_task(p)); |
1523 | 1520 | ||
1521 | rq = task_rq(p); | ||
1522 | src_rd = rq->rd; | ||
1523 | /* | ||
1524 | * Migrating a SCHED_DEADLINE task between exclusive | ||
1525 | * cpusets (different root_domains) entails a bandwidth | ||
1526 | * update. We already made space for us in the destination | ||
1527 | * domain (see cpuset_can_attach()). | ||
1528 | */ | ||
1529 | if (!cpumask_intersects(src_rd->span, new_mask)) { | ||
1530 | struct dl_bw *src_dl_b; | ||
1531 | |||
1532 | src_dl_b = dl_bw_of(cpu_of(rq)); | ||
1533 | /* | ||
1534 | * We now free resources of the root_domain we are migrating | ||
1535 | * off. In the worst case, sched_setattr() may temporary fail | ||
1536 | * until we complete the update. | ||
1537 | */ | ||
1538 | raw_spin_lock(&src_dl_b->lock); | ||
1539 | __dl_clear(src_dl_b, p->dl.dl_bw); | ||
1540 | raw_spin_unlock(&src_dl_b->lock); | ||
1541 | } | ||
1542 | |||
1524 | /* | 1543 | /* |
1525 | * Update only if the task is actually running (i.e., | 1544 | * Update only if the task is actually running (i.e., |
1526 | * it is on the rq AND it is not throttled). | 1545 | * it is on the rq AND it is not throttled). |
@@ -1537,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
1537 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | 1556 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1538 | return; | 1557 | return; |
1539 | 1558 | ||
1540 | rq = task_rq(p); | ||
1541 | |||
1542 | /* | 1559 | /* |
1543 | * The process used to be able to migrate OR it can now migrate | 1560 | * The process used to be able to migrate OR it can now migrate |
1544 | */ | 1561 | */ |
@@ -1586,22 +1603,48 @@ void init_sched_dl_class(void) | |||
1586 | 1603 | ||
1587 | #endif /* CONFIG_SMP */ | 1604 | #endif /* CONFIG_SMP */ |
1588 | 1605 | ||
1606 | /* | ||
1607 | * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. | ||
1608 | */ | ||
1609 | static void cancel_dl_timer(struct rq *rq, struct task_struct *p) | ||
1610 | { | ||
1611 | struct hrtimer *dl_timer = &p->dl.dl_timer; | ||
1612 | |||
1613 | /* Nobody will change task's class if pi_lock is held */ | ||
1614 | lockdep_assert_held(&p->pi_lock); | ||
1615 | |||
1616 | if (hrtimer_active(dl_timer)) { | ||
1617 | int ret = hrtimer_try_to_cancel(dl_timer); | ||
1618 | |||
1619 | if (unlikely(ret == -1)) { | ||
1620 | /* | ||
1621 | * Note, p may migrate OR new deadline tasks | ||
1622 | * may appear in rq when we are unlocking it. | ||
1623 | * A caller of us must be fine with that. | ||
1624 | */ | ||
1625 | raw_spin_unlock(&rq->lock); | ||
1626 | hrtimer_cancel(dl_timer); | ||
1627 | raw_spin_lock(&rq->lock); | ||
1628 | } | ||
1629 | } | ||
1630 | } | ||
1631 | |||
1589 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | 1632 | static void switched_from_dl(struct rq *rq, struct task_struct *p) |
1590 | { | 1633 | { |
1591 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1634 | cancel_dl_timer(rq, p); |
1592 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
1593 | 1635 | ||
1594 | __dl_clear_params(p); | 1636 | __dl_clear_params(p); |
1595 | 1637 | ||
1596 | #ifdef CONFIG_SMP | ||
1597 | /* | 1638 | /* |
1598 | * Since this might be the only -deadline task on the rq, | 1639 | * Since this might be the only -deadline task on the rq, |
1599 | * this is the right place to try to pull some other one | 1640 | * this is the right place to try to pull some other one |
1600 | * from an overloaded cpu, if any. | 1641 | * from an overloaded cpu, if any. |
1601 | */ | 1642 | */ |
1602 | if (!rq->dl.dl_nr_running) | 1643 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
1603 | pull_dl_task(rq); | 1644 | return; |
1604 | #endif | 1645 | |
1646 | if (pull_dl_task(rq)) | ||
1647 | resched_curr(rq); | ||
1605 | } | 1648 | } |
1606 | 1649 | ||
1607 | /* | 1650 | /* |
@@ -1622,7 +1665,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1622 | 1665 | ||
1623 | if (task_on_rq_queued(p) && rq->curr != p) { | 1666 | if (task_on_rq_queued(p) && rq->curr != p) { |
1624 | #ifdef CONFIG_SMP | 1667 | #ifdef CONFIG_SMP |
1625 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1668 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
1669 | push_dl_task(rq) && rq != task_rq(p)) | ||
1626 | /* Only reschedule if pushing failed */ | 1670 | /* Only reschedule if pushing failed */ |
1627 | check_resched = 0; | 1671 | check_resched = 0; |
1628 | #endif /* CONFIG_SMP */ | 1672 | #endif /* CONFIG_SMP */ |
@@ -1704,3 +1748,12 @@ const struct sched_class dl_sched_class = { | |||
1704 | 1748 | ||
1705 | .update_curr = update_curr_dl, | 1749 | .update_curr = update_curr_dl, |
1706 | }; | 1750 | }; |
1751 | |||
1752 | #ifdef CONFIG_SCHED_DEBUG | ||
1753 | extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); | ||
1754 | |||
1755 | void print_dl_stats(struct seq_file *m, int cpu) | ||
1756 | { | ||
1757 | print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); | ||
1758 | } | ||
1759 | #endif /* CONFIG_SCHED_DEBUG */ | ||
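The cancel_dl_timer() helper added above uses the standard pattern for cancelling an hrtimer whose callback takes the lock the canceller currently holds: hrtimer_try_to_cancel() returns -1 while the callback is running, so the lock is dropped around a blocking hrtimer_cancel() and then re-acquired, and every caller must tolerate that window. A generic, hedged sketch of the pattern with illustrative names:

#include <linux/hrtimer.h>
#include <linux/spinlock.h>

/*
 * 'lock' is held by the caller and is also taken by the timer callback,
 * so we must not wait for the callback to finish while holding it.
 */
static void cancel_timer_locked(struct hrtimer *timer, raw_spinlock_t *lock)
{
	if (!hrtimer_active(timer))
		return;

	if (hrtimer_try_to_cancel(timer) == -1) {
		/*
		 * The callback is running and may be spinning on 'lock'.
		 * Drop the lock so it can finish, wait for it, re-acquire.
		 * Anything protected by 'lock' may have changed meanwhile.
		 */
		raw_spin_unlock(lock);
		hrtimer_cancel(timer);
		raw_spin_lock(lock);
	}
}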
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
261 | #undef P | 261 | #undef P |
262 | } | 262 | } |
263 | 263 | ||
264 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | ||
265 | { | ||
266 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | ||
267 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | ||
268 | } | ||
269 | |||
264 | extern __read_mostly int sched_clock_running; | 270 | extern __read_mostly int sched_clock_running; |
265 | 271 | ||
266 | static void print_cpu(struct seq_file *m, int cpu) | 272 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -329,6 +335,7 @@ do { \ | |||
329 | spin_lock_irqsave(&sched_debug_lock, flags); | 335 | spin_lock_irqsave(&sched_debug_lock, flags); |
330 | print_cfs_stats(m, cpu); | 336 | print_cfs_stats(m, cpu); |
331 | print_rt_stats(m, cpu); | 337 | print_rt_stats(m, cpu); |
338 | print_dl_stats(m, cpu); | ||
332 | 339 | ||
333 | print_rq(m, rq, cpu); | 340 | print_rq(m, rq, cpu); |
334 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 341 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) | |||
528 | unsigned long nr_faults = -1; | 535 | unsigned long nr_faults = -1; |
529 | int cpu_current, home_node; | 536 | int cpu_current, home_node; |
530 | 537 | ||
531 | if (p->numa_faults_memory) | 538 | if (p->numa_faults) |
532 | nr_faults = p->numa_faults_memory[2*node + i]; | 539 | nr_faults = p->numa_faults[2*node + i]; |
533 | 540 | ||
534 | cpu_current = !i ? (task_node(p) == node) : | 541 | cpu_current = !i ? (task_node(p) == node) : |
535 | (pol && node_isset(node, pol->v.nodes)); | 542 | (pol && node_isset(node, pol->v.nodes)); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104b254c..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,7 +873,6 @@ struct numa_group { | |||
873 | spinlock_t lock; /* nr_tasks, tasks */ | 873 | spinlock_t lock; /* nr_tasks, tasks */ |
874 | int nr_tasks; | 874 | int nr_tasks; |
875 | pid_t gid; | 875 | pid_t gid; |
876 | struct list_head task_list; | ||
877 | 876 | ||
878 | struct rcu_head rcu; | 877 | struct rcu_head rcu; |
879 | nodemask_t active_nodes; | 878 | nodemask_t active_nodes; |
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
901 | return p->numa_group ? p->numa_group->gid : 0; | 900 | return p->numa_group ? p->numa_group->gid : 0; |
902 | } | 901 | } |
903 | 902 | ||
904 | static inline int task_faults_idx(int nid, int priv) | 903 | /* |
904 | * The averaged statistics, shared & private, memory & cpu, | ||
905 | * occupy the first half of the array. The second half of the | ||
906 | * array is for current counters, which are averaged into the | ||
907 | * first set by task_numa_placement. | ||
908 | */ | ||
909 | static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) | ||
905 | { | 910 | { |
906 | return NR_NUMA_HINT_FAULT_TYPES * nid + priv; | 911 | return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; |
907 | } | 912 | } |
908 | 913 | ||
909 | static inline unsigned long task_faults(struct task_struct *p, int nid) | 914 | static inline unsigned long task_faults(struct task_struct *p, int nid) |
910 | { | 915 | { |
911 | if (!p->numa_faults_memory) | 916 | if (!p->numa_faults) |
912 | return 0; | 917 | return 0; |
913 | 918 | ||
914 | return p->numa_faults_memory[task_faults_idx(nid, 0)] + | 919 | return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
915 | p->numa_faults_memory[task_faults_idx(nid, 1)]; | 920 | p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
916 | } | 921 | } |
917 | 922 | ||
918 | static inline unsigned long group_faults(struct task_struct *p, int nid) | 923 | static inline unsigned long group_faults(struct task_struct *p, int nid) |
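The comment above describes the layout behind the unified p->numa_faults array: counters are indexed by (stat, node, fault class), with the averaged NUMA_MEM/NUMA_CPU halves first and the per-window buffer halves after them. A tiny user-space model of the index computation; the enum names follow the ones this series introduces, but should be read as illustrative:

#include <stdio.h>

enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2	/* two fault classes per (stat, node) slot */

static int nr_node_ids = 2;

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	/*
	 * With 2 nodes the array holds 4 * 2 * 2 = 16 counters: indices 0-7
	 * are the averaged stats, indices 8-15 the current buffers that
	 * task_numa_placement() folds back into the first half.
	 */
	printf("NUMA_MEM,    nid=1, priv=1 -> %d\n", task_faults_idx(NUMA_MEM, 1, 1));
	printf("NUMA_MEMBUF, nid=0, priv=0 -> %d\n", task_faults_idx(NUMA_MEMBUF, 0, 0));
	return 0;
}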
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
920 | if (!p->numa_group) | 925 | if (!p->numa_group) |
921 | return 0; | 926 | return 0; |
922 | 927 | ||
923 | return p->numa_group->faults[task_faults_idx(nid, 0)] + | 928 | return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
924 | p->numa_group->faults[task_faults_idx(nid, 1)]; | 929 | p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
925 | } | 930 | } |
926 | 931 | ||
927 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | 932 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) |
928 | { | 933 | { |
929 | return group->faults_cpu[task_faults_idx(nid, 0)] + | 934 | return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + |
930 | group->faults_cpu[task_faults_idx(nid, 1)]; | 935 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; |
936 | } | ||
937 | |||
938 | /* Handle placement on systems where not all nodes are directly connected. */ | ||
939 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | ||
940 | int maxdist, bool task) | ||
941 | { | ||
942 | unsigned long score = 0; | ||
943 | int node; | ||
944 | |||
945 | /* | ||
946 | * All nodes are directly connected, and the same distance | ||
947 | * from each other. No need for fancy placement algorithms. | ||
948 | */ | ||
949 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
950 | return 0; | ||
951 | |||
952 | /* | ||
953 | * This code is called for each node, introducing N^2 complexity, | ||
954 | * which should be ok given the number of nodes rarely exceeds 8. | ||
955 | */ | ||
956 | for_each_online_node(node) { | ||
957 | unsigned long faults; | ||
958 | int dist = node_distance(nid, node); | ||
959 | |||
960 | /* | ||
961 | * The furthest away nodes in the system are not interesting | ||
962 | * for placement; nid was already counted. | ||
963 | */ | ||
964 | if (dist == sched_max_numa_distance || node == nid) | ||
965 | continue; | ||
966 | |||
967 | /* | ||
968 | * On systems with a backplane NUMA topology, compare groups | ||
969 | * of nodes, and move tasks towards the group with the most | ||
970 | * memory accesses. When comparing two nodes at distance | ||
971 | * "hoplimit", only nodes closer by than "hoplimit" are part | ||
972 | * of each group. Skip other nodes. | ||
973 | */ | ||
974 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
975 | dist > maxdist) | ||
976 | continue; | ||
977 | |||
978 | /* Add up the faults from nearby nodes. */ | ||
979 | if (task) | ||
980 | faults = task_faults(p, node); | ||
981 | else | ||
982 | faults = group_faults(p, node); | ||
983 | |||
984 | /* | ||
985 | * On systems with a glueless mesh NUMA topology, there are | ||
986 | * no fixed "groups of nodes". Instead, nodes that are not | ||
987 | * directly connected bounce traffic through intermediate | ||
988 | * nodes; a numa_group can occupy any set of nodes. | ||
989 | * The further away a node is, the less the faults count. | ||
990 | * This seems to result in good task placement. | ||
991 | */ | ||
992 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
993 | faults *= (sched_max_numa_distance - dist); | ||
994 | faults /= (sched_max_numa_distance - LOCAL_DISTANCE); | ||
995 | } | ||
996 | |||
997 | score += faults; | ||
998 | } | ||
999 | |||
1000 | return score; | ||
931 | } | 1001 | } |
932 | 1002 | ||
933 | /* | 1003 | /* |
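To make the glueless-mesh branch of score_nearby_nodes() concrete: faults from a remote node are scaled by (sched_max_numa_distance - dist) / (sched_max_numa_distance - LOCAL_DISTANCE). With an invented table where LOCAL_DISTANCE is 10 and the largest distance is 40, a node 20 away contributes 2/3 of its faults, a node 30 away only 1/3, and a node at the maximum distance of 40 is skipped entirely by the earlier dist == sched_max_numa_distance check, so placement is pulled toward nodes that relay traffic for the group.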
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | |||
936 | * larger multiplier, in order to group tasks together that are almost | 1006 | * larger multiplier, in order to group tasks together that are almost |
937 | * evenly spread out between numa nodes. | 1007 | * evenly spread out between numa nodes. |
938 | */ | 1008 | */ |
939 | static inline unsigned long task_weight(struct task_struct *p, int nid) | 1009 | static inline unsigned long task_weight(struct task_struct *p, int nid, |
1010 | int dist) | ||
940 | { | 1011 | { |
941 | unsigned long total_faults; | 1012 | unsigned long faults, total_faults; |
942 | 1013 | ||
943 | if (!p->numa_faults_memory) | 1014 | if (!p->numa_faults) |
944 | return 0; | 1015 | return 0; |
945 | 1016 | ||
946 | total_faults = p->total_numa_faults; | 1017 | total_faults = p->total_numa_faults; |
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) | |||
948 | if (!total_faults) | 1019 | if (!total_faults) |
949 | return 0; | 1020 | return 0; |
950 | 1021 | ||
951 | return 1000 * task_faults(p, nid) / total_faults; | 1022 | faults = task_faults(p, nid); |
1023 | faults += score_nearby_nodes(p, nid, dist, true); | ||
1024 | |||
1025 | return 1000 * faults / total_faults; | ||
952 | } | 1026 | } |
953 | 1027 | ||
954 | static inline unsigned long group_weight(struct task_struct *p, int nid) | 1028 | static inline unsigned long group_weight(struct task_struct *p, int nid, |
1029 | int dist) | ||
955 | { | 1030 | { |
956 | if (!p->numa_group || !p->numa_group->total_faults) | 1031 | unsigned long faults, total_faults; |
1032 | |||
1033 | if (!p->numa_group) | ||
957 | return 0; | 1034 | return 0; |
958 | 1035 | ||
959 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | 1036 | total_faults = p->numa_group->total_faults; |
1037 | |||
1038 | if (!total_faults) | ||
1039 | return 0; | ||
1040 | |||
1041 | faults = group_faults(p, nid); | ||
1042 | faults += score_nearby_nodes(p, nid, dist, false); | ||
1043 | |||
1044 | return 1000 * faults / total_faults; | ||
960 | } | 1045 | } |
961 | 1046 | ||
962 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | 1047 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, |
@@ -1089,6 +1174,7 @@ struct task_numa_env { | |||
1089 | struct numa_stats src_stats, dst_stats; | 1174 | struct numa_stats src_stats, dst_stats; |
1090 | 1175 | ||
1091 | int imbalance_pct; | 1176 | int imbalance_pct; |
1177 | int dist; | ||
1092 | 1178 | ||
1093 | struct task_struct *best_task; | 1179 | struct task_struct *best_task; |
1094 | long best_imp; | 1180 | long best_imp; |
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1168 | long load; | 1254 | long load; |
1169 | long imp = env->p->numa_group ? groupimp : taskimp; | 1255 | long imp = env->p->numa_group ? groupimp : taskimp; |
1170 | long moveimp = imp; | 1256 | long moveimp = imp; |
1257 | int dist = env->dist; | ||
1171 | 1258 | ||
1172 | rcu_read_lock(); | 1259 | rcu_read_lock(); |
1173 | 1260 | ||
@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1208 | * in any group then look only at task weights. | 1295 | * in any group then look only at task weights. |
1209 | */ | 1296 | */ |
1210 | if (cur->numa_group == env->p->numa_group) { | 1297 | if (cur->numa_group == env->p->numa_group) { |
1211 | imp = taskimp + task_weight(cur, env->src_nid) - | 1298 | imp = taskimp + task_weight(cur, env->src_nid, dist) - |
1212 | task_weight(cur, env->dst_nid); | 1299 | task_weight(cur, env->dst_nid, dist); |
1213 | /* | 1300 | /* |
1214 | * Add some hysteresis to prevent swapping the | 1301 | * Add some hysteresis to prevent swapping the |
1215 | * tasks within a group over tiny differences. | 1302 | * tasks within a group over tiny differences. |
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1223 | * instead. | 1310 | * instead. |
1224 | */ | 1311 | */ |
1225 | if (cur->numa_group) | 1312 | if (cur->numa_group) |
1226 | imp += group_weight(cur, env->src_nid) - | 1313 | imp += group_weight(cur, env->src_nid, dist) - |
1227 | group_weight(cur, env->dst_nid); | 1314 | group_weight(cur, env->dst_nid, dist); |
1228 | else | 1315 | else |
1229 | imp += task_weight(cur, env->src_nid) - | 1316 | imp += task_weight(cur, env->src_nid, dist) - |
1230 | task_weight(cur, env->dst_nid); | 1317 | task_weight(cur, env->dst_nid, dist); |
1231 | } | 1318 | } |
1232 | } | 1319 | } |
1233 | 1320 | ||
@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1326 | }; | 1413 | }; |
1327 | struct sched_domain *sd; | 1414 | struct sched_domain *sd; |
1328 | unsigned long taskweight, groupweight; | 1415 | unsigned long taskweight, groupweight; |
1329 | int nid, ret; | 1416 | int nid, ret, dist; |
1330 | long taskimp, groupimp; | 1417 | long taskimp, groupimp; |
1331 | 1418 | ||
1332 | /* | 1419 | /* |
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) | |||
1354 | return -EINVAL; | 1441 | return -EINVAL; |
1355 | } | 1442 | } |
1356 | 1443 | ||
1357 | taskweight = task_weight(p, env.src_nid); | ||
1358 | groupweight = group_weight(p, env.src_nid); | ||
1359 | update_numa_stats(&env.src_stats, env.src_nid); | ||
1360 | env.dst_nid = p->numa_preferred_nid; | 1444 | env.dst_nid = p->numa_preferred_nid; |
1361 | taskimp = task_weight(p, env.dst_nid) - taskweight; | 1445 | dist = env.dist = node_distance(env.src_nid, env.dst_nid); |
1362 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1446 | taskweight = task_weight(p, env.src_nid, dist); |
1447 | groupweight = group_weight(p, env.src_nid, dist); | ||
1448 | update_numa_stats(&env.src_stats, env.src_nid); | ||
1449 | taskimp = task_weight(p, env.dst_nid, dist) - taskweight; | ||
1450 | groupimp = group_weight(p, env.dst_nid, dist) - groupweight; | ||
1363 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1451 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1364 | 1452 | ||
1365 | /* Try to find a spot on the preferred nid. */ | 1453 | /* Try to find a spot on the preferred nid. */ |
1366 | task_numa_find_cpu(&env, taskimp, groupimp); | 1454 | task_numa_find_cpu(&env, taskimp, groupimp); |
1367 | 1455 | ||
1368 | /* No space available on the preferred nid. Look elsewhere. */ | 1456 | /* |
1369 | if (env.best_cpu == -1) { | 1457 | * Look at other nodes in these cases: |
1458 | * - there is no space available on the preferred_nid | ||
1459 | * - the task is part of a numa_group that is interleaved across | ||
1460 | * multiple NUMA nodes; in order to better consolidate the group, | ||
1461 | * we need to check other locations. | ||
1462 | */ | ||
1463 | if (env.best_cpu == -1 || (p->numa_group && | ||
1464 | nodes_weight(p->numa_group->active_nodes) > 1)) { | ||
1370 | for_each_online_node(nid) { | 1465 | for_each_online_node(nid) { |
1371 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | 1466 | if (nid == env.src_nid || nid == p->numa_preferred_nid) |
1372 | continue; | 1467 | continue; |
1373 | 1468 | ||
1469 | dist = node_distance(env.src_nid, env.dst_nid); | ||
1470 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
1471 | dist != env.dist) { | ||
1472 | taskweight = task_weight(p, env.src_nid, dist); | ||
1473 | groupweight = group_weight(p, env.src_nid, dist); | ||
1474 | } | ||
1475 | |||
1374 | /* Only consider nodes where both task and groups benefit */ | 1476 | /* Only consider nodes where both task and groups benefit */ |
1375 | taskimp = task_weight(p, nid) - taskweight; | 1477 | taskimp = task_weight(p, nid, dist) - taskweight; |
1376 | groupimp = group_weight(p, nid) - groupweight; | 1478 | groupimp = group_weight(p, nid, dist) - groupweight; |
1377 | if (taskimp < 0 && groupimp < 0) | 1479 | if (taskimp < 0 && groupimp < 0) |
1378 | continue; | 1480 | continue; |
1379 | 1481 | ||
1482 | env.dist = dist; | ||
1380 | env.dst_nid = nid; | 1483 | env.dst_nid = nid; |
1381 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1484 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1382 | task_numa_find_cpu(&env, taskimp, groupimp); | 1485 | task_numa_find_cpu(&env, taskimp, groupimp); |
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1431 | unsigned long interval = HZ; | 1534 | unsigned long interval = HZ; |
1432 | 1535 | ||
1433 | /* This task has no NUMA fault statistics yet */ | 1536 | /* This task has no NUMA fault statistics yet */ |
1434 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) | 1537 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
1435 | return; | 1538 | return; |
1436 | 1539 | ||
1437 | /* Periodically retry migrating the task to the preferred node */ | 1540 | /* Periodically retry migrating the task to the preferred node */ |
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1580 | return delta; | 1683 | return delta; |
1581 | } | 1684 | } |
1582 | 1685 | ||
1686 | /* | ||
1687 | * Determine the preferred nid for a task in a numa_group. This needs to | ||
1688 | * be done in a way that produces consistent results with group_weight, | ||
1689 | * otherwise workloads might not converge. | ||
1690 | */ | ||
1691 | static int preferred_group_nid(struct task_struct *p, int nid) | ||
1692 | { | ||
1693 | nodemask_t nodes; | ||
1694 | int dist; | ||
1695 | |||
1696 | /* Direct connections between all NUMA nodes. */ | ||
1697 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
1698 | return nid; | ||
1699 | |||
1700 | /* | ||
1701 | * On a system with glueless mesh NUMA topology, group_weight | ||
1702 | * scores nodes according to the number of NUMA hinting faults on | ||
1703 | * both the node itself, and on nearby nodes. | ||
1704 | */ | ||
1705 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
1706 | unsigned long score, max_score = 0; | ||
1707 | int node, max_node = nid; | ||
1708 | |||
1709 | dist = sched_max_numa_distance; | ||
1710 | |||
1711 | for_each_online_node(node) { | ||
1712 | score = group_weight(p, node, dist); | ||
1713 | if (score > max_score) { | ||
1714 | max_score = score; | ||
1715 | max_node = node; | ||
1716 | } | ||
1717 | } | ||
1718 | return max_node; | ||
1719 | } | ||
1720 | |||
1721 | /* | ||
1722 | * Finding the preferred nid in a system with NUMA backplane | ||
1723 | * interconnect topology is more involved. The goal is to locate | ||
1724 | * tasks from numa_groups near each other in the system, and | ||
1725 | * untangle workloads from different sides of the system. This requires | ||
1726 | * searching down the hierarchy of node groups, recursively searching | ||
1727 | * inside the highest scoring group of nodes. The nodemask tricks | ||
1728 | * keep the complexity of the search down. | ||
1729 | */ | ||
1730 | nodes = node_online_map; | ||
1731 | for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { | ||
1732 | unsigned long max_faults = 0; | ||
1733 | nodemask_t max_group; | ||
1734 | int a, b; | ||
1735 | |||
1736 | /* Are there nodes at this distance from each other? */ | ||
1737 | if (!find_numa_distance(dist)) | ||
1738 | continue; | ||
1739 | |||
1740 | for_each_node_mask(a, nodes) { | ||
1741 | unsigned long faults = 0; | ||
1742 | nodemask_t this_group; | ||
1743 | nodes_clear(this_group); | ||
1744 | |||
1745 | /* Sum group's NUMA faults; includes a==b case. */ | ||
1746 | for_each_node_mask(b, nodes) { | ||
1747 | if (node_distance(a, b) < dist) { | ||
1748 | faults += group_faults(p, b); | ||
1749 | node_set(b, this_group); | ||
1750 | node_clear(b, nodes); | ||
1751 | } | ||
1752 | } | ||
1753 | |||
1754 | /* Remember the top group. */ | ||
1755 | if (faults > max_faults) { | ||
1756 | max_faults = faults; | ||
1757 | max_group = this_group; | ||
1758 | /* | ||
1759 | * subtle: at the smallest distance there is | ||
1760 | * just one node left in each "group", the | ||
1761 | * winner is the preferred nid. | ||
1762 | */ | ||
1763 | nid = a; | ||
1764 | } | ||
1765 | } | ||
1766 | /* Next round, evaluate the nodes within max_group. */ | ||
1767 | nodes = max_group; | ||
1768 | } | ||
1769 | return nid; | ||
1770 | } | ||
1771 | |||
1583 | static void task_numa_placement(struct task_struct *p) | 1772 | static void task_numa_placement(struct task_struct *p) |
1584 | { | 1773 | { |
1585 | int seq, nid, max_nid = -1, max_group_nid = -1; | 1774 | int seq, nid, max_nid = -1, max_group_nid = -1; |
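For readers following the backplane branch above, a minimal userspace model of the narrowing search in preferred_group_nid(). Everything in it is assumed for illustration: plain bitmasks stand in for nodemask_t, a static table stands in for group_faults(), and the 4-node distance matrix is invented; it is not the kernel implementation.

#include <stdio.h>

#define NR_NODES	4
#define LOCAL_DISTANCE	10
#define MAX_DISTANCE	40

static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Stand-in for group_faults(p, nid): per-node fault counts of the group. */
static const unsigned long faults[NR_NODES] = { 5, 10, 30, 25 };

static int preferred_group_nid_model(int nid)
{
	unsigned int nodes = (1u << NR_NODES) - 1;	/* all nodes "online" */
	int dist;

	/*
	 * The kernel skips distance values that do not occur in the table
	 * (find_numa_distance()); this model just brute-forces every value.
	 */
	for (dist = MAX_DISTANCE; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0;
		int a, b;

		for (a = 0; a < NR_NODES; a++) {
			unsigned long f = 0;
			unsigned int this_group = 0;

			if (!(nodes & (1u << a)))
				continue;

			/* Group a with every remaining node closer than 'dist'. */
			for (b = 0; b < NR_NODES; b++) {
				if ((nodes & (1u << b)) && distance[a][b] < dist) {
					f += faults[b];
					this_group |= 1u << b;
				}
			}
			nodes &= ~this_group;

			if (f > max_faults) {
				max_faults = f;
				max_group = this_group;
				nid = a;	/* meaningful once groups shrink to one node */
			}
		}
		/* Next round only searches inside the best-scoring group. */
		nodes = max_group;
	}
	return nid;
}

int main(void)
{
	/* Nodes 2 and 3 (one "board") hold most of the group's faults. */
	printf("preferred nid: %d\n", preferred_group_nid_model(0));	/* 2 */
	return 0;
}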
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) | |||
1607 | 1796 | ||
1608 | /* Find the node with the highest number of faults */ | 1797 | /* Find the node with the highest number of faults */ |
1609 | for_each_online_node(nid) { | 1798 | for_each_online_node(nid) { |
1799 | /* Keep track of the offsets in numa_faults array */ | ||
1800 | int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; | ||
1610 | unsigned long faults = 0, group_faults = 0; | 1801 | unsigned long faults = 0, group_faults = 0; |
1611 | int priv, i; | 1802 | int priv; |
1612 | 1803 | ||
1613 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { | 1804 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { |
1614 | long diff, f_diff, f_weight; | 1805 | long diff, f_diff, f_weight; |
1615 | 1806 | ||
1616 | i = task_faults_idx(nid, priv); | 1807 | mem_idx = task_faults_idx(NUMA_MEM, nid, priv); |
1808 | membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); | ||
1809 | cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); | ||
1810 | cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); | ||
1617 | 1811 | ||
1618 | /* Decay existing window, copy faults since last scan */ | 1812 | /* Decay existing window, copy faults since last scan */ |
1619 | diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; | 1813 | diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; |
1620 | fault_types[priv] += p->numa_faults_buffer_memory[i]; | 1814 | fault_types[priv] += p->numa_faults[membuf_idx]; |
1621 | p->numa_faults_buffer_memory[i] = 0; | 1815 | p->numa_faults[membuf_idx] = 0; |
1622 | 1816 | ||
1623 | /* | 1817 | /* |
1624 | * Normalize the faults_from, so all tasks in a group | 1818 | * Normalize the faults_from, so all tasks in a group |
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) | |||
1628 | * faults are less important. | 1822 | * faults are less important. |
1629 | */ | 1823 | */ |
1630 | f_weight = div64_u64(runtime << 16, period + 1); | 1824 | f_weight = div64_u64(runtime << 16, period + 1); |
1631 | f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / | 1825 | f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / |
1632 | (total_faults + 1); | 1826 | (total_faults + 1); |
1633 | f_diff = f_weight - p->numa_faults_cpu[i] / 2; | 1827 | f_diff = f_weight - p->numa_faults[cpu_idx] / 2; |
1634 | p->numa_faults_buffer_cpu[i] = 0; | 1828 | p->numa_faults[cpubuf_idx] = 0; |
1635 | 1829 | ||
1636 | p->numa_faults_memory[i] += diff; | 1830 | p->numa_faults[mem_idx] += diff; |
1637 | p->numa_faults_cpu[i] += f_diff; | 1831 | p->numa_faults[cpu_idx] += f_diff; |
1638 | faults += p->numa_faults_memory[i]; | 1832 | faults += p->numa_faults[mem_idx]; |
1639 | p->total_numa_faults += diff; | 1833 | p->total_numa_faults += diff; |
1640 | if (p->numa_group) { | 1834 | if (p->numa_group) { |
1641 | /* safe because we can only change our own group */ | 1835 | /* |
1642 | p->numa_group->faults[i] += diff; | 1836 | * safe because we can only change our own group |
1643 | p->numa_group->faults_cpu[i] += f_diff; | 1837 | * |
1838 | * mem_idx represents the offset for a given | ||
1839 | * nid and priv in a specific region because the | ||
1840 | * NUMA_MEM region is at the start of the numa_faults array. | ||
1841 | */ | ||
1842 | p->numa_group->faults[mem_idx] += diff; | ||
1843 | p->numa_group->faults_cpu[mem_idx] += f_diff; | ||
1644 | p->numa_group->total_faults += diff; | 1844 | p->numa_group->total_faults += diff; |
1645 | group_faults += p->numa_group->faults[i]; | 1845 | group_faults += p->numa_group->faults[mem_idx]; |
1646 | } | 1846 | } |
1647 | } | 1847 | } |
1648 | 1848 | ||
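The diff/f_diff arithmetic above is an exponentially decaying average: each placement pass keeps half of the long-term counter and folds in whatever was recorded in the buffer since the last scan. A tiny userspace illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
	long mem = 100;		/* long-term p->numa_faults[mem_idx] */
	long membuf = 40;	/* faults seen since the last scan */

	long diff = membuf - mem / 2;	/* as in the hunk above */
	mem += diff;			/* == mem / 2 + membuf */
	membuf = 0;			/* buffer restarts for the next window */

	printf("decayed counter: %ld, buffer: %ld\n", mem, membuf);	/* 90, 0 */
	return 0;
}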
@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) | |||
1662 | if (p->numa_group) { | 1862 | if (p->numa_group) { |
1663 | update_numa_active_node_mask(p->numa_group); | 1863 | update_numa_active_node_mask(p->numa_group); |
1664 | spin_unlock_irq(group_lock); | 1864 | spin_unlock_irq(group_lock); |
1665 | max_nid = max_group_nid; | 1865 | max_nid = preferred_group_nid(p, max_group_nid); |
1666 | } | 1866 | } |
1667 | 1867 | ||
1668 | if (max_faults) { | 1868 | if (max_faults) { |
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1705 | 1905 | ||
1706 | atomic_set(&grp->refcount, 1); | 1906 | atomic_set(&grp->refcount, 1); |
1707 | spin_lock_init(&grp->lock); | 1907 | spin_lock_init(&grp->lock); |
1708 | INIT_LIST_HEAD(&grp->task_list); | ||
1709 | grp->gid = p->pid; | 1908 | grp->gid = p->pid; |
1710 | /* Second half of the array tracks nids where faults happen */ | 1909 | /* Second half of the array tracks nids where faults happen */ |
1711 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | 1910 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * |
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1714 | node_set(task_node(current), grp->active_nodes); | 1913 | node_set(task_node(current), grp->active_nodes); |
1715 | 1914 | ||
1716 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 1915 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
1717 | grp->faults[i] = p->numa_faults_memory[i]; | 1916 | grp->faults[i] = p->numa_faults[i]; |
1718 | 1917 | ||
1719 | grp->total_faults = p->total_numa_faults; | 1918 | grp->total_faults = p->total_numa_faults; |
1720 | 1919 | ||
1721 | list_add(&p->numa_entry, &grp->task_list); | ||
1722 | grp->nr_tasks++; | 1920 | grp->nr_tasks++; |
1723 | rcu_assign_pointer(p->numa_group, grp); | 1921 | rcu_assign_pointer(p->numa_group, grp); |
1724 | } | 1922 | } |
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1773 | double_lock_irq(&my_grp->lock, &grp->lock); | 1971 | double_lock_irq(&my_grp->lock, &grp->lock); |
1774 | 1972 | ||
1775 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { | 1973 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
1776 | my_grp->faults[i] -= p->numa_faults_memory[i]; | 1974 | my_grp->faults[i] -= p->numa_faults[i]; |
1777 | grp->faults[i] += p->numa_faults_memory[i]; | 1975 | grp->faults[i] += p->numa_faults[i]; |
1778 | } | 1976 | } |
1779 | my_grp->total_faults -= p->total_numa_faults; | 1977 | my_grp->total_faults -= p->total_numa_faults; |
1780 | grp->total_faults += p->total_numa_faults; | 1978 | grp->total_faults += p->total_numa_faults; |
1781 | 1979 | ||
1782 | list_move(&p->numa_entry, &grp->task_list); | ||
1783 | my_grp->nr_tasks--; | 1980 | my_grp->nr_tasks--; |
1784 | grp->nr_tasks++; | 1981 | grp->nr_tasks++; |
1785 | 1982 | ||
@@ -1799,27 +1996,23 @@ no_join: | |||
1799 | void task_numa_free(struct task_struct *p) | 1996 | void task_numa_free(struct task_struct *p) |
1800 | { | 1997 | { |
1801 | struct numa_group *grp = p->numa_group; | 1998 | struct numa_group *grp = p->numa_group; |
1802 | void *numa_faults = p->numa_faults_memory; | 1999 | void *numa_faults = p->numa_faults; |
1803 | unsigned long flags; | 2000 | unsigned long flags; |
1804 | int i; | 2001 | int i; |
1805 | 2002 | ||
1806 | if (grp) { | 2003 | if (grp) { |
1807 | spin_lock_irqsave(&grp->lock, flags); | 2004 | spin_lock_irqsave(&grp->lock, flags); |
1808 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 2005 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
1809 | grp->faults[i] -= p->numa_faults_memory[i]; | 2006 | grp->faults[i] -= p->numa_faults[i]; |
1810 | grp->total_faults -= p->total_numa_faults; | 2007 | grp->total_faults -= p->total_numa_faults; |
1811 | 2008 | ||
1812 | list_del(&p->numa_entry); | ||
1813 | grp->nr_tasks--; | 2009 | grp->nr_tasks--; |
1814 | spin_unlock_irqrestore(&grp->lock, flags); | 2010 | spin_unlock_irqrestore(&grp->lock, flags); |
1815 | RCU_INIT_POINTER(p->numa_group, NULL); | 2011 | RCU_INIT_POINTER(p->numa_group, NULL); |
1816 | put_numa_group(grp); | 2012 | put_numa_group(grp); |
1817 | } | 2013 | } |
1818 | 2014 | ||
1819 | p->numa_faults_memory = NULL; | 2015 | p->numa_faults = NULL; |
1820 | p->numa_faults_buffer_memory = NULL; | ||
1821 | p->numa_faults_cpu= NULL; | ||
1822 | p->numa_faults_buffer_cpu = NULL; | ||
1823 | kfree(numa_faults); | 2016 | kfree(numa_faults); |
1824 | } | 2017 | } |
1825 | 2018 | ||
@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1842 | return; | 2035 | return; |
1843 | 2036 | ||
1844 | /* Allocate buffer to track faults on a per-node basis */ | 2037 | /* Allocate buffer to track faults on a per-node basis */ |
1845 | if (unlikely(!p->numa_faults_memory)) { | 2038 | if (unlikely(!p->numa_faults)) { |
1846 | int size = sizeof(*p->numa_faults_memory) * | 2039 | int size = sizeof(*p->numa_faults) * |
1847 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; | 2040 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; |
1848 | 2041 | ||
1849 | p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); | 2042 | p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); |
1850 | if (!p->numa_faults_memory) | 2043 | if (!p->numa_faults) |
1851 | return; | 2044 | return; |
1852 | 2045 | ||
1853 | BUG_ON(p->numa_faults_buffer_memory); | ||
1854 | /* | ||
1855 | * The averaged statistics, shared & private, memory & cpu, | ||
1856 | * occupy the first half of the array. The second half of the | ||
1857 | * array is for current counters, which are averaged into the | ||
1858 | * first set by task_numa_placement. | ||
1859 | */ | ||
1860 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); | ||
1861 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
1862 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
1863 | p->total_numa_faults = 0; | 2046 | p->total_numa_faults = 0; |
1864 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 2047 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
1865 | } | 2048 | } |
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1899 | if (migrated) | 2082 | if (migrated) |
1900 | p->numa_pages_migrated += pages; | 2083 | p->numa_pages_migrated += pages; |
1901 | 2084 | ||
1902 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; | 2085 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; |
1903 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | 2086 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; |
1904 | p->numa_faults_locality[local] += pages; | 2087 | p->numa_faults_locality[local] += pages; |
1905 | } | 2088 | } |
1906 | 2089 | ||
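The NUMA_MEMBUF/NUMA_CPUBUF indices above address one unified numa_faults array holding four regions. The index helper itself is outside this hunk, so the layout below is only an assumed sketch; any function works as long as it gives every (region, nid, priv) triple a unique, stable slot with the NUMA_MEM region first:

#include <stdio.h>

enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2	/* private, shared */

static int nr_node_ids = 4;		/* example machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	/* For 4 nodes each region spans 8 slots: MEM 0..7, CPU 8..15, ... */
	printf("MEM    nid1 shared : %d\n", task_faults_idx(NUMA_MEM, 1, 1));	/* 3 */
	printf("CPUBUF nid3 private: %d\n", task_faults_idx(NUMA_CPUBUF, 3, 0));	/* 30 */
	return 0;
}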
@@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
4469 | latest_idle_timestamp = rq->idle_stamp; | 4652 | latest_idle_timestamp = rq->idle_stamp; |
4470 | shallowest_idle_cpu = i; | 4653 | shallowest_idle_cpu = i; |
4471 | } | 4654 | } |
4472 | } else { | 4655 | } else if (shallowest_idle_cpu == -1) { |
4473 | load = weighted_cpuload(i); | 4656 | load = weighted_cpuload(i); |
4474 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4657 | if (load < min_load || (load == min_load && i == this_cpu)) { |
4475 | min_load = load; | 4658 | min_load = load; |
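The "else if (shallowest_idle_cpu == -1)" change makes the load comparison for busy CPUs moot as soon as any idle CPU has been found. A simplified userspace model (assumed: it ignores the cpuidle exit-latency preference and keeps only the idle-timestamp tie-break):

#include <stdio.h>

struct cpu {
	int idle;			/* 1 if the CPU is idle */
	unsigned long load;		/* weighted load when busy */
	unsigned long idle_stamp;	/* when it last went idle */
};

static int find_idlest(const struct cpu *cpus, int n)
{
	unsigned long min_load = ~0UL, latest_stamp = 0;
	int best_busy = -1, best_idle = -1, i;

	for (i = 0; i < n; i++) {
		if (cpus[i].idle) {
			if (cpus[i].idle_stamp > latest_stamp) {
				latest_stamp = cpus[i].idle_stamp;
				best_idle = i;
			}
		} else if (best_idle == -1) {	/* the new condition above */
			if (cpus[i].load < min_load) {
				min_load = cpus[i].load;
				best_busy = i;
			}
		}
	}
	return best_idle != -1 ? best_idle : best_busy;
}

int main(void)
{
	struct cpu cpus[] = {
		{ .idle = 0, .load = 100 },
		{ .idle = 1, .idle_stamp = 42 },
		{ .idle = 0, .load = 10 },
	};

	printf("chosen cpu: %d\n", find_idlest(cpus, 3));	/* 1: the idle CPU wins */
	return 0;
}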
@@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4547 | int want_affine = 0; | 4730 | int want_affine = 0; |
4548 | int sync = wake_flags & WF_SYNC; | 4731 | int sync = wake_flags & WF_SYNC; |
4549 | 4732 | ||
4550 | if (p->nr_cpus_allowed == 1) | ||
4551 | return prev_cpu; | ||
4552 | |||
4553 | if (sd_flag & SD_BALANCE_WAKE) | 4733 | if (sd_flag & SD_BALANCE_WAKE) |
4554 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 4734 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
4555 | 4735 | ||
@@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
5189 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5369 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
5190 | int src_nid, dst_nid; | 5370 | int src_nid, dst_nid; |
5191 | 5371 | ||
5192 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || | 5372 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || |
5193 | !(env->sd->flags & SD_NUMA)) { | 5373 | !(env->sd->flags & SD_NUMA)) { |
5194 | return false; | 5374 | return false; |
5195 | } | 5375 | } |
@@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5228 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5408 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
5229 | return false; | 5409 | return false; |
5230 | 5410 | ||
5231 | if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) | 5411 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
5232 | return false; | 5412 | return false; |
5233 | 5413 | ||
5234 | src_nid = cpu_to_node(env->src_cpu); | 5414 | src_nid = cpu_to_node(env->src_cpu); |
@@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6172 | * with a large weight task outweighs the tasks on the system). | 6352 | * with a large weight task outweighs the tasks on the system). |
6173 | */ | 6353 | */ |
6174 | if (prefer_sibling && sds->local && | 6354 | if (prefer_sibling && sds->local && |
6175 | sds->local_stat.group_has_free_capacity) | 6355 | sds->local_stat.group_has_free_capacity) { |
6176 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6356 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); |
6357 | sgs->group_type = group_classify(sg, sgs); | ||
6358 | } | ||
6177 | 6359 | ||
6178 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6360 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
6179 | sds->busiest = sg; | 6361 | sds->busiest = sg; |
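Re-running group_classify() after the SD_PREFER_SIBLING clamp matters because lowering group_capacity_factor can flip the group into the overloaded class. A reduced stand-in (assumed; only the capacity check is modelled):

#include <stdio.h>

enum group_type { group_other, group_imbalanced, group_overloaded };

struct sg_lb_stats {
	unsigned int sum_nr_running;
	unsigned int group_capacity_factor;
};

static enum group_type classify(const struct sg_lb_stats *sgs)
{
	if (sgs->sum_nr_running > sgs->group_capacity_factor)
		return group_overloaded;
	return group_other;
}

int main(void)
{
	struct sg_lb_stats sgs = { .sum_nr_running = 2, .group_capacity_factor = 2 };

	printf("before clamp: %d\n", classify(&sgs));	/* group_other */
	sgs.group_capacity_factor = 1;			/* prefer-sibling clamp */
	printf("after clamp : %d\n", classify(&sgs));	/* group_overloaded */
	return 0;
}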
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 20bca398084a..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
1301 | struct task_struct *curr; | 1301 | struct task_struct *curr; |
1302 | struct rq *rq; | 1302 | struct rq *rq; |
1303 | 1303 | ||
1304 | if (p->nr_cpus_allowed == 1) | ||
1305 | goto out; | ||
1306 | |||
1307 | /* For anything but wake ups, just return the task_cpu */ | 1304 | /* For anything but wake ups, just return the task_cpu */ |
1308 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1305 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
1309 | goto out; | 1306 | goto out; |
@@ -1351,16 +1348,22 @@ out: | |||
1351 | 1348 | ||
1352 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1349 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1353 | { | 1350 | { |
1354 | if (rq->curr->nr_cpus_allowed == 1) | 1351 | /* |
1352 | * Current can't be migrated, useless to reschedule, | ||
1353 | * let's hope p can move out. | ||
1354 | */ | ||
1355 | if (rq->curr->nr_cpus_allowed == 1 || | ||
1356 | !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
1355 | return; | 1357 | return; |
1356 | 1358 | ||
1359 | /* | ||
1360 | * p is migratable, so let's not schedule it and | ||
1361 | * see if it is pushed or pulled somewhere else. | ||
1362 | */ | ||
1357 | if (p->nr_cpus_allowed != 1 | 1363 | if (p->nr_cpus_allowed != 1 |
1358 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1364 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1359 | return; | 1365 | return; |
1360 | 1366 | ||
1361 | if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
1362 | return; | ||
1363 | |||
1364 | /* | 1367 | /* |
1365 | * There appears to be other cpus that can accept | 1368 | * There appears to be other cpus that can accept |
1366 | * current and none to run 'p', so lets reschedule | 1369 | * current and none to run 'p', so lets reschedule |
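The rewritten check_preempt_equal_prio() reduces to a two-input decision: reschedule only when current could run somewhere else and the woken task p could not. A toy truth-table model (not kernel code; each input folds together nr_cpus_allowed and the cpupri_find() result):

#include <stdbool.h>
#include <stdio.h>

static bool resched_curr_for_p(bool curr_can_go, bool p_can_go)
{
	if (!curr_can_go)	/* current is pinned or has nowhere to go */
		return false;
	if (p_can_go)		/* p can be pushed elsewhere instead */
		return false;
	return true;		/* push current away and run p here */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       resched_curr_for_p(false, false),	/* 0 */
	       resched_curr_for_p(false, true),		/* 0 */
	       resched_curr_for_p(true,  false),	/* 1 */
	       resched_curr_for_p(true,  true));	/* 0 */
	return 0;
}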
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2df8ef067cc5..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -176,6 +176,25 @@ struct dl_bw { | |||
176 | u64 bw, total_bw; | 176 | u64 bw, total_bw; |
177 | }; | 177 | }; |
178 | 178 | ||
179 | static inline | ||
180 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
181 | { | ||
182 | dl_b->total_bw -= tsk_bw; | ||
183 | } | ||
184 | |||
185 | static inline | ||
186 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
187 | { | ||
188 | dl_b->total_bw += tsk_bw; | ||
189 | } | ||
190 | |||
191 | static inline | ||
192 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
193 | { | ||
194 | return dl_b->bw != -1 && | ||
195 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
196 | } | ||
197 | |||
179 | extern struct mutex sched_domains_mutex; | 198 | extern struct mutex sched_domains_mutex; |
180 | 199 | ||
181 | #ifdef CONFIG_CGROUP_SCHED | 200 | #ifdef CONFIG_CGROUP_SCHED |
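The new __dl_clear()/__dl_add()/__dl_overflow() helpers implement SCHED_DEADLINE admission control against a bandwidth budget shared by a set of CPUs. A userspace model of the overflow test; the 2^20 fixed-point utilization scale and the numbers are assumptions for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1ULL << BW_SHIFT)	/* utilization of 1.0 */

struct dl_bw {
	int64_t bw;		/* allowed utilization per CPU, -1 == no limit */
	uint64_t total_bw;	/* sum of admitted task bandwidths */
};

static bool dl_overflow(const struct dl_bw *dl_b, int cpus,
			uint64_t old_bw, uint64_t new_bw)
{
	return dl_b->bw != -1 &&
	       (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* 95% allowed per CPU, 2 CPUs, 1.5 CPUs worth already admitted. */
	struct dl_bw dl_b = {
		.bw = (int64_t)(BW_UNIT * 95 / 100),
		.total_bw = BW_UNIT * 3 / 2,
	};

	/* Asking for another 0.5 would need 2.0 > 1.9 total: rejected. */
	printf("overflow: %d\n", dl_overflow(&dl_b, 2, 0, BW_UNIT / 2));	/* 1 */
	return 0;
}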
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
678 | return rq->clock_task; | 697 | return rq->clock_task; |
679 | } | 698 | } |
680 | 699 | ||
700 | #ifdef CONFIG_NUMA | ||
701 | enum numa_topology_type { | ||
702 | NUMA_DIRECT, | ||
703 | NUMA_GLUELESS_MESH, | ||
704 | NUMA_BACKPLANE, | ||
705 | }; | ||
706 | extern enum numa_topology_type sched_numa_topology_type; | ||
707 | extern int sched_max_numa_distance; | ||
708 | extern bool find_numa_distance(int distance); | ||
709 | #endif | ||
710 | |||
681 | #ifdef CONFIG_NUMA_BALANCING | 711 | #ifdef CONFIG_NUMA_BALANCING |
712 | /* The regions in numa_faults array from task_struct */ | ||
713 | enum numa_faults_stats { | ||
714 | NUMA_MEM = 0, | ||
715 | NUMA_CPU, | ||
716 | NUMA_MEMBUF, | ||
717 | NUMA_CPUBUF | ||
718 | }; | ||
682 | extern void sched_setnuma(struct task_struct *p, int node); | 719 | extern void sched_setnuma(struct task_struct *p, int node); |
683 | extern int migrate_task_to(struct task_struct *p, int cpu); | 720 | extern int migrate_task_to(struct task_struct *p, int cpu); |
684 | extern int migrate_swap(struct task_struct *, struct task_struct *); | 721 | extern int migrate_swap(struct task_struct *, struct task_struct *); |
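The three numa_topology_type values describe how remote nodes reach each other; the detection itself lives elsewhere in this series (kernel/sched/core.c). The sketch below is only an assumed, simplified heuristic over a toy distance table, showing how the three cases can be told apart:

#include <stdio.h>

#define NR_NODES 4

enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE };

static enum numa_topology_type classify(const int d[NR_NODES][NR_NODES], int max_dist)
{
	int a, b, c, direct = 1;

	/* A single remote distance everywhere: fully connected. */
	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++)
			if (a != b && d[a][b] != d[0][1])
				direct = 0;
	if (direct)
		return NUMA_DIRECT;

	/* For the furthest-apart pairs: is some node close to both ends? */
	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			if (d[a][b] < max_dist)
				continue;
			for (c = 0; c < NR_NODES; c++)
				if (d[a][c] < max_dist && d[b][c] < max_dist)
					return NUMA_GLUELESS_MESH;
			return NUMA_BACKPLANE;
		}
	}
	return NUMA_DIRECT;
}

int main(void)
{
	/* Two 2-node boards joined by a backplane: no intermediary nodes. */
	static const int backplane[NR_NODES][NR_NODES] = {
		{ 10, 20, 40, 40 },
		{ 20, 10, 40, 40 },
		{ 40, 40, 10, 20 },
		{ 40, 40, 20, 10 },
	};

	printf("type: %d\n", classify(backplane, 40));	/* 2 == NUMA_BACKPLANE */
	return 0;
}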
@@ -1127,6 +1164,11 @@ struct sched_class { | |||
1127 | void (*task_fork) (struct task_struct *p); | 1164 | void (*task_fork) (struct task_struct *p); |
1128 | void (*task_dead) (struct task_struct *p); | 1165 | void (*task_dead) (struct task_struct *p); |
1129 | 1166 | ||
1167 | /* | ||
1168 | * The switched_from() call is allowed to drop rq->lock, therefore we | ||
1169 | * cannot assume the switched_from/switched_to pair is serliazed by | ||
1170 | * rq->lock. They are however serialized by p->pi_lock. | ||
1171 | */ | ||
1130 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1172 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1131 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1173 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1132 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1174 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
@@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | |||
1504 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | 1546 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); |
1505 | extern void print_cfs_stats(struct seq_file *m, int cpu); | 1547 | extern void print_cfs_stats(struct seq_file *m, int cpu); |
1506 | extern void print_rt_stats(struct seq_file *m, int cpu); | 1548 | extern void print_rt_stats(struct seq_file *m, int cpu); |
1549 | extern void print_dl_stats(struct seq_file *m, int cpu); | ||
1507 | 1550 | ||
1508 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1551 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1509 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1552 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
12 | #include <linux/kthread.h> | ||
12 | 13 | ||
13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) | 14 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
14 | { | 15 | { |
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * | |||
297 | } | 298 | } |
298 | EXPORT_SYMBOL(autoremove_wake_function); | 299 | EXPORT_SYMBOL(autoremove_wake_function); |
299 | 300 | ||
301 | static inline bool is_kthread_should_stop(void) | ||
302 | { | ||
303 | return (current->flags & PF_KTHREAD) && kthread_should_stop(); | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * DEFINE_WAIT_FUNC(wait, woken_wake_func); | ||
308 | * | ||
309 | * add_wait_queue(&wq, &wait); | ||
310 | * for (;;) { | ||
311 | * if (condition) | ||
312 | * break; | ||
313 | * | ||
314 | * p->state = mode; condition = true; | ||
315 | * smp_mb(); // A smp_wmb(); // C | ||
316 | * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN; | ||
317 | * schedule() try_to_wake_up(); | ||
318 | * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ | ||
319 | * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; | ||
320 | * smp_mb() // B smp_wmb(); // C | ||
321 | * wait->flags |= WQ_FLAG_WOKEN; | ||
322 | * } | ||
323 | * remove_wait_queue(&wq, &wait); | ||
324 | * | ||
325 | */ | ||
326 | long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | ||
327 | { | ||
328 | set_current_state(mode); /* A */ | ||
329 | /* | ||
330 | * The above implies an smp_mb(), which matches with the smp_wmb() from | ||
331 | * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must | ||
332 | * also observe all state before the wakeup. | ||
333 | */ | ||
334 | if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) | ||
335 | timeout = schedule_timeout(timeout); | ||
336 | __set_current_state(TASK_RUNNING); | ||
337 | |||
338 | /* | ||
339 | * The below implies an smp_mb(), it too pairs with the smp_wmb() from | ||
340 | * woken_wake_function() such that we must either observe the wait | ||
341 | * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss | ||
342 | * an event. | ||
343 | */ | ||
344 | set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ | ||
345 | |||
346 | return timeout; | ||
347 | } | ||
348 | EXPORT_SYMBOL(wait_woken); | ||
349 | |||
350 | int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
351 | { | ||
352 | /* | ||
353 | * Although this function is called under waitqueue lock, LOCK | ||
354 | * doesn't imply write barrier and the users expects write | ||
355 | * barrier semantics on wakeup functions. The following | ||
356 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() | ||
357 | * and is paired with set_mb() in wait_woken(). | ||
358 | */ | ||
359 | smp_wmb(); /* C */ | ||
360 | wait->flags |= WQ_FLAG_WOKEN; | ||
361 | |||
362 | return default_wake_function(wait, mode, sync, key); | ||
363 | } | ||
364 | EXPORT_SYMBOL(woken_wake_function); | ||
365 | |||
300 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) | 366 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) |
301 | { | 367 | { |
302 | struct wait_bit_key *key = arg; | 368 | struct wait_bit_key *key = arg; |
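A usage sketch of the new pair for a hypothetical driver; demo_dev, demo_wait_for_data() and demo_data_arrived() are invented names, and locking around the queue is omitted. The point is the loop shape: the condition is tested first and wait_woken() handles the task state plus the WQ_FLAG_WOKEN handshake documented above:

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/wait.h>

struct demo_dev {			/* hypothetical device */
	wait_queue_head_t	wq;
	struct sk_buff_head	queue;
};

static int demo_wait_for_data(struct demo_dev *dev, long timeout)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&dev->wq, &wait);
	while (!skb_queue_len(&dev->queue)) {
		if (signal_pending(current) || !timeout)
			break;
		/* Sets the task state, sleeps, clears WQ_FLAG_WOKEN again. */
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
	}
	remove_wait_queue(&dev->wq, &wait);

	return skb_queue_len(&dev->queue) ? 0 : -EAGAIN;
}

/* Producer side: publish the data first, then wake the queue as usual; the
 * wakeup reaches woken_wake_function() through the wait entry above. */
static void demo_data_arrived(struct demo_dev *dev, struct sk_buff *skb)
{
	skb_queue_tail(&dev->queue, skb);
	wake_up_interruptible(&dev->wq);
}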
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) | |||
110 | set_current_state(TASK_INTERRUPTIBLE); | 110 | set_current_state(TASK_INTERRUPTIBLE); |
111 | preempt_disable(); | 111 | preempt_disable(); |
112 | if (kthread_should_stop()) { | 112 | if (kthread_should_stop()) { |
113 | set_current_state(TASK_RUNNING); | 113 | __set_current_state(TASK_RUNNING); |
114 | preempt_enable(); | 114 | preempt_enable(); |
115 | if (ht->cleanup) | 115 | if (ht->cleanup) |
116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); |
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) | |||
136 | /* Check for state change setup */ | 136 | /* Check for state change setup */ |
137 | switch (td->status) { | 137 | switch (td->status) { |
138 | case HP_THREAD_NONE: | 138 | case HP_THREAD_NONE: |
139 | __set_current_state(TASK_RUNNING); | ||
139 | preempt_enable(); | 140 | preempt_enable(); |
140 | if (ht->setup) | 141 | if (ht->setup) |
141 | ht->setup(td->cpu); | 142 | ht->setup(td->cpu); |
142 | td->status = HP_THREAD_ACTIVE; | 143 | td->status = HP_THREAD_ACTIVE; |
143 | preempt_disable(); | 144 | continue; |
144 | break; | 145 | |
145 | case HP_THREAD_PARKED: | 146 | case HP_THREAD_PARKED: |
147 | __set_current_state(TASK_RUNNING); | ||
146 | preempt_enable(); | 148 | preempt_enable(); |
147 | if (ht->unpark) | 149 | if (ht->unpark) |
148 | ht->unpark(td->cpu); | 150 | ht->unpark(td->cpu); |
149 | td->status = HP_THREAD_ACTIVE; | 151 | td->status = HP_THREAD_ACTIVE; |
150 | preempt_disable(); | 152 | continue; |
151 | break; | ||
152 | } | 153 | } |
153 | 154 | ||
154 | if (!ht->thread_should_run(td->cpu)) { | 155 | if (!ht->thread_should_run(td->cpu)) { |
155 | preempt_enable(); | 156 | preempt_enable_no_resched(); |
156 | schedule(); | 157 | schedule(); |
157 | } else { | 158 | } else { |
158 | set_current_state(TASK_RUNNING); | 159 | __set_current_state(TASK_RUNNING); |
159 | preempt_enable(); | 160 | preempt_enable(); |
160 | ht->thread_fn(td->cpu); | 161 | ht->thread_fn(td->cpu); |
161 | } | 162 | } |
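The smpboot changes above keep the standard kthread sleep idiom intact while fixing which state-setter is used on the paths that never sleep. As a standalone sketch (demo_thread_fn and its helpers are hypothetical), the idiom is:

#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical helpers, for illustration only. */
static bool demo_work_pending(void *data);
static void demo_do_work(void *data);

static int demo_thread_fn(void *data)
{
	for (;;) {
		/* Mark the task as sleeping *before* testing the conditions,
		 * so a wakeup arriving in between is not lost (it simply puts
		 * the state back to TASK_RUNNING). */
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		if (!demo_work_pending(data)) {
			schedule();		/* still TASK_INTERRUPTIBLE */
			continue;
		}

		/* Not sleeping on this path: the non-barrier
		 * __set_current_state() is sufficient here. */
		__set_current_state(TASK_RUNNING);
		demo_do_work(data);
	}
	return 0;
}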