Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c              11
-rw-r--r--  kernel/cpuset.c             23
-rw-r--r--  kernel/exit.c                5
-rw-r--r--  kernel/locking/mutex.c       8
-rw-r--r--  kernel/module.c             30
-rw-r--r--  kernel/sched/completion.c    5
-rw-r--r--  kernel/sched/core.c        241
-rw-r--r--  kernel/sched/cpudeadline.h   3
-rw-r--r--  kernel/sched/cpupri.h        3
-rw-r--r--  kernel/sched/deadline.c     99
-rw-r--r--  kernel/sched/debug.c        11
-rw-r--r--  kernel/sched/fair.c        354
-rw-r--r--  kernel/sched/rt.c           17
-rw-r--r--  kernel/sched/sched.h        43
-rw-r--r--  kernel/sched/wait.c         66
-rw-r--r--  kernel/smpboot.c            15
16 files changed, 721 insertions, 213 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
499 set_freezable(); 499 set_freezable();
500 while (!kthread_should_stop()) { 500 while (!kthread_should_stop()) {
501 struct sk_buff *skb; 501 struct sk_buff *skb;
502 DECLARE_WAITQUEUE(wait, current);
503 502
504 flush_hold_queue(); 503 flush_hold_queue();
505 504
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
514 audit_printk_skb(skb); 513 audit_printk_skb(skb);
515 continue; 514 continue;
516 } 515 }
517 set_current_state(TASK_INTERRUPTIBLE);
518 add_wait_queue(&kauditd_wait, &wait);
519 516
520 if (!skb_queue_len(&audit_skb_queue)) { 517 wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
521 try_to_freeze();
522 schedule();
523 }
524
525 __set_current_state(TASK_RUNNING);
526 remove_wait_queue(&kauditd_wait, &wait);
527 } 518 }
528 return 0; 519 return 0;
529} 520}
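
The kauditd hunk above collapses an open-coded freezable wait (waitqueue entry, set_current_state(), try_to_freeze(), schedule()) into a single wait_event_freezable() call. A minimal kernel-style sketch of the equivalence; the queue my_wq and the condition work_pending() are hypothetical names, not from this patch:

#include <linux/wait.h>
#include <linux/freezer.h>
#include <linux/sched.h>
#include <linux/types.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);

/* The shape of the code being removed: manual queueing, state juggling, freezing. */
static void wait_open_coded(bool (*work_pending)(void))
{
        DECLARE_WAITQUEUE(wait, current);

        set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(&my_wq, &wait);
        if (!work_pending()) {
                try_to_freeze();
                schedule();
        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&my_wq, &wait);
}

/* The replacement: one helper that sleeps, cooperates with the freezer and re-checks. */
static void wait_with_helper(bool (*work_pending)(void))
{
        wait_event_freezable(my_wq, work_pending());
}
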
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
506 goto out; 506 goto out;
507 } 507 }
508 508
509 /*
510 * We can't shrink if we won't have enough room for SCHED_DEADLINE
511 * tasks.
512 */
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
509 ret = 0; 519 ret = 0;
510out: 520out:
511 rcu_read_unlock(); 521 rcu_read_unlock();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1429 goto out_unlock; 1439 goto out_unlock;
1430 1440
1431 cgroup_taskset_for_each(task, tset) { 1441 cgroup_taskset_for_each(task, tset) {
1432 /* 1442 ret = task_can_attach(task, cs->cpus_allowed);
1433 * Kthreads which disallow setaffinity shouldn't be moved 1443 if (ret)
1434 * to a new cpuset; we don't want to change their cpu
1435 * affinity and isolating such threads by their set of
1436 * allowed nodes is unnecessary. Thus, cpusets are not
1437 * applicable for such threads. This prevents checking for
1438 * success of set_cpus_allowed_ptr() on all attached tasks
1439 * before cpus_allowed may be changed.
1440 */
1441 ret = -EINVAL;
1442 if (task->flags & PF_NO_SETAFFINITY)
1443 goto out_unlock; 1444 goto out_unlock;
1444 ret = security_task_setscheduler(task); 1445 ret = security_task_setscheduler(task);
1445 if (ret) 1446 if (ret)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..232c4bc8bcc9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
997 997
998 get_task_struct(p); 998 get_task_struct(p);
999 read_unlock(&tasklist_lock); 999 read_unlock(&tasklist_lock);
1000 sched_annotate_sleep();
1001
1000 if ((exit_code & 0x7f) == 0) { 1002 if ((exit_code & 0x7f) == 0) {
1001 why = CLD_EXITED; 1003 why = CLD_EXITED;
1002 status = exit_code >> 8; 1004 status = exit_code >> 8;
@@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1079 * thread can reap it because we set its state == DEAD/TRACE. 1081
1080 */ 1082 */
1081 read_unlock(&tasklist_lock); 1083 read_unlock(&tasklist_lock);
1084 sched_annotate_sleep();
1082 1085
1083 retval = wo->wo_rusage 1086 retval = wo->wo_rusage
1084 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1087 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
@@ -1210,6 +1213,7 @@ unlock_sig:
1210 pid = task_pid_vnr(p); 1213 pid = task_pid_vnr(p);
1211 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1214 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1212 read_unlock(&tasklist_lock); 1215 read_unlock(&tasklist_lock);
1216 sched_annotate_sleep();
1213 1217
1214 if (unlikely(wo->wo_flags & WNOWAIT)) 1218 if (unlikely(wo->wo_flags & WNOWAIT))
1215 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1219 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1272 pid = task_pid_vnr(p); 1276 pid = task_pid_vnr(p);
1273 get_task_struct(p); 1277 get_task_struct(p);
1274 read_unlock(&tasklist_lock); 1278 read_unlock(&tasklist_lock);
1279 sched_annotate_sleep();
1275 1280
1276 if (!wo->wo_info) { 1281 if (!wo->wo_info) {
1277 retval = wo->wo_rusage 1282 retval = wo->wo_rusage
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
378 * reschedule now, before we try-lock the mutex. This avoids getting 378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex. 379 * scheduled out right after we obtained the mutex.
380 */ 380 */
381 if (need_resched()) 381 if (need_resched()) {
382 /*
383 * We _should_ have TASK_RUNNING here, but just in case
384 * we do not, make it so, otherwise we might get stuck.
385 */
386 __set_current_state(TASK_RUNNING);
382 schedule_preempt_disabled(); 387 schedule_preempt_disabled();
388 }
383 389
384 return false; 390 return false;
385} 391}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void)
3097} 3097}
3098 3098
3099/* 3099/*
3100 * Can't use wait_event_interruptible() because our condition
3101 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3102 */
3103static int wait_finished_loading(struct module *mod)
3104{
3105 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3106 int ret = 0;
3107
3108 add_wait_queue(&module_wq, &wait);
3109 for (;;) {
3110 if (finished_loading(mod->name))
3111 break;
3112
3113 if (signal_pending(current)) {
3114 ret = -ERESTARTSYS;
3115 break;
3116 }
3117
3118 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3119 }
3120 remove_wait_queue(&module_wq, &wait);
3121
3122 return ret;
3123}
3124
3125/*
3100 * We try to place it in the list now to make sure it's unique before 3126 * We try to place it in the list now to make sure it's unique before
3101 * we dedicate too many resources. In particular, temporary percpu 3127 * we dedicate too many resources. In particular, temporary percpu
3102 * memory exhaustion. 3128 * memory exhaustion.
@@ -3116,8 +3142,8 @@ again:
3116 || old->state == MODULE_STATE_UNFORMED) { 3142 || old->state == MODULE_STATE_UNFORMED) {
3117 /* Wait in case it fails to load. */ 3143 /* Wait in case it fails to load. */
3118 mutex_unlock(&module_mutex); 3144 mutex_unlock(&module_mutex);
3119 err = wait_event_interruptible(module_wq, 3145
3120 finished_loading(mod->name)); 3146 err = wait_finished_loading(mod);
3121 if (err) 3147 if (err)
3122 goto out_unlocked; 3148 goto out_unlocked;
3123 goto again; 3149 goto again;
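
The helper added above is needed because wait_event_interruptible() puts the task into TASK_INTERRUPTIBLE before evaluating its condition, and finished_loading() takes module_mutex, so the condition itself can sleep and would clobber that state (and trip the new blocking-while-!TASK_RUNNING warning added to __might_sleep() later in this series). The wait_woken() pattern keeps the task in TASK_RUNNING while the condition runs and only hands the sleep state to wait_woken(), which uses the wait entry's WQ_FLAG_WOKEN bit to avoid losing wakeups. A generic sketch of the same loop; example_wq and may_sleep_condition() are made-up names:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/types.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);

static int wait_for_condition(bool (*may_sleep_condition)(void))
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int ret = 0;

        add_wait_queue(&example_wq, &wait);
        for (;;) {
                /* Still TASK_RUNNING here, so the condition is free to block. */
                if (may_sleep_condition())
                        break;

                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                /* Sleeps unless a wakeup already marked the entry as woken. */
                wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&example_wq, &wait);

        return ret;
}
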
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
148 * 148 *
149 * This waits to be signaled for completion of a specific task. It is NOT 149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting 150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO. 151 * for IO (which traditionally means blkio only).
152 */ 152 */
153void __sched wait_for_completion_io(struct completion *x) 153void __sched wait_for_completion_io(struct completion *x)
154{ 154{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
163 * 163 *
164 * This waits for either a completion of a specific task to be signaled or for a 164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not 165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO. 166 * interruptible. The caller is accounted as waiting for IO (which traditionally
167 * means blkio only).
167 * 168 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 169 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed. 170 * till timeout) if completed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
1008 return cpu_curr(task_cpu(p)) == p; 1008 return cpu_curr(task_cpu(p)) == p;
1009} 1009}
1010 1010
 1011/*
 1012 * rq->lock may be dropped here, because sched_class::switched_from() methods may drop it.
 1013 */
1011static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1014static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1012 const struct sched_class *prev_class, 1015 const struct sched_class *prev_class,
1013 int oldprio) 1016 int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1015 if (prev_class != p->sched_class) { 1018 if (prev_class != p->sched_class) {
1016 if (prev_class->switched_from) 1019 if (prev_class->switched_from)
1017 prev_class->switched_from(rq, p); 1020 prev_class->switched_from(rq, p);
 1021 /* Possible rq->lock 'hole'. */
1018 p->sched_class->switched_to(rq, p); 1022 p->sched_class->switched_to(rq, p);
1019 } else if (oldprio != p->prio || dl_task(p)) 1023 } else if (oldprio != p->prio || dl_task(p))
1020 p->sched_class->prio_changed(rq, p, oldprio); 1024 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1054 * ttwu() will sort out the placement. 1058 * ttwu() will sort out the placement.
1055 */ 1059 */
1056 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1060 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1057 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1061 !p->on_rq);
1058 1062
1059#ifdef CONFIG_LOCKDEP 1063#ifdef CONFIG_LOCKDEP
1060 /* 1064 /*
@@ -1407,7 +1411,8 @@ out:
1407static inline 1411static inline
1408int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1412int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1409{ 1413{
1410 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1414 if (p->nr_cpus_allowed > 1)
1415 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1411 1416
1412 /* 1417 /*
1413 * In order not to call set_task_cpu() on a blocking task we need 1418 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
1623 struct rq *rq = cpu_rq(cpu); 1628 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags; 1629 unsigned long flags;
1625 1630
1626 if (!is_idle_task(rq->curr)) 1631 rcu_read_lock();
1627 return; 1632
1633 if (!is_idle_task(rcu_dereference(rq->curr)))
1634 goto out;
1628 1635
1629 if (set_nr_if_polling(rq->idle)) { 1636 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu); 1637 trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
1635 /* Else cpu is not in idle, do nothing here */ 1642 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags); 1643 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 } 1644 }
1645
1646out:
1647 rcu_read_unlock();
1638} 1648}
1639 1649
1640bool cpus_share_cache(int this_cpu, int that_cpu) 1650bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1853 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1863 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1854 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1864 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1855 p->numa_work.next = &p->numa_work; 1865 p->numa_work.next = &p->numa_work;
1856 p->numa_faults_memory = NULL; 1866 p->numa_faults = NULL;
1857 p->numa_faults_buffer_memory = NULL;
1858 p->last_task_numa_placement = 0; 1867 p->last_task_numa_placement = 0;
1859 p->last_sum_exec_runtime = 0; 1868 p->last_sum_exec_runtime = 0;
1860 1869
1861 INIT_LIST_HEAD(&p->numa_entry);
1862 p->numa_group = NULL; 1870 p->numa_group = NULL;
1863#endif /* CONFIG_NUMA_BALANCING */ 1871#endif /* CONFIG_NUMA_BALANCING */
1864} 1872}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
2034} 2042}
2035#endif 2043#endif
2036 2044
2037static inline
2038void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
2039{
2040 dl_b->total_bw -= tsk_bw;
2041}
2042
2043static inline
2044void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
2045{
2046 dl_b->total_bw += tsk_bw;
2047}
2048
2049static inline
2050bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
2051{
2052 return dl_b->bw != -1 &&
2053 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
2054}
2055
2056/* 2045/*
2057 * We must be sure that accepting a new task (or allowing changing the 2046 * We must be sure that accepting a new task (or allowing changing the
2058 * parameters of an existing one) is consistent with the bandwidth 2047 * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2220 2209
2221/** 2210/**
2222 * finish_task_switch - clean up after a task-switch 2211 * finish_task_switch - clean up after a task-switch
2223 * @rq: runqueue associated with task-switch
2224 * @prev: the thread we just switched away from. 2212 * @prev: the thread we just switched away from.
2225 * 2213 *
2226 * finish_task_switch must be called after the context switch, paired 2214 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2232 * so, we finish that here outside of the runqueue lock. (Doing it 2220 * so, we finish that here outside of the runqueue lock. (Doing it
2233 * with the lock held can cause deadlocks; see schedule() for 2221 * with the lock held can cause deadlocks; see schedule() for
2234 * details.) 2222 * details.)
2223 *
 2224 * The context switch has flipped the stack from under us and restored the
2225 * local variables which were saved when this task called schedule() in the
2226 * past. prev == current is still correct but we need to recalculate this_rq
2227 * because prev may have moved to another CPU.
2235 */ 2228 */
2236static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2229static struct rq *finish_task_switch(struct task_struct *prev)
2237 __releases(rq->lock) 2230 __releases(rq->lock)
2238{ 2231{
2232 struct rq *rq = this_rq();
2239 struct mm_struct *mm = rq->prev_mm; 2233 struct mm_struct *mm = rq->prev_mm;
2240 long prev_state; 2234 long prev_state;
2241 2235
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2275 } 2269 }
2276 2270
2277 tick_nohz_task_switch(current); 2271 tick_nohz_task_switch(current);
2272 return rq;
2278} 2273}
2279 2274
2280#ifdef CONFIG_SMP 2275#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
2309asmlinkage __visible void schedule_tail(struct task_struct *prev) 2304asmlinkage __visible void schedule_tail(struct task_struct *prev)
2310 __releases(rq->lock) 2305 __releases(rq->lock)
2311{ 2306{
2312 struct rq *rq = this_rq(); 2307 struct rq *rq;
2313
2314 finish_task_switch(rq, prev);
2315 2308
2316 /* 2309 /* finish_task_switch() drops rq->lock and enables preemption */
2317 * FIXME: do we need to worry about rq being invalidated by the 2310 preempt_disable();
2318 * task_switch? 2311 rq = finish_task_switch(prev);
2319 */
2320 post_schedule(rq); 2312 post_schedule(rq);
2313 preempt_enable();
2321 2314
2322 if (current->set_child_tid) 2315 if (current->set_child_tid)
2323 put_user(task_pid_vnr(current), current->set_child_tid); 2316 put_user(task_pid_vnr(current), current->set_child_tid);
2324} 2317}
2325 2318
2326/* 2319/*
2327 * context_switch - switch to the new MM and the new 2320 * context_switch - switch to the new MM and the new thread's register state.
2328 * thread's register state.
2329 */ 2321 */
2330static inline void 2322static inline struct rq *
2331context_switch(struct rq *rq, struct task_struct *prev, 2323context_switch(struct rq *rq, struct task_struct *prev,
2332 struct task_struct *next) 2324 struct task_struct *next)
2333{ 2325{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
2366 context_tracking_task_switch(prev, next); 2358 context_tracking_task_switch(prev, next);
2367 /* Here we just switch the register state and the stack. */ 2359 /* Here we just switch the register state and the stack. */
2368 switch_to(prev, next, prev); 2360 switch_to(prev, next, prev);
2369
2370 barrier(); 2361 barrier();
2371 /* 2362
2372 * this_rq must be evaluated again because prev may have moved 2363 return finish_task_switch(prev);
2373 * CPUs since it called schedule(), thus the 'rq' on its stack
2374 * frame will be invalid.
2375 */
2376 finish_task_switch(this_rq(), prev);
2377} 2364}
2378 2365
2379/* 2366/*
@@ -2826,15 +2813,8 @@ need_resched:
2826 rq->curr = next; 2813 rq->curr = next;
2827 ++*switch_count; 2814 ++*switch_count;
2828 2815
2829 context_switch(rq, prev, next); /* unlocks the rq */ 2816 rq = context_switch(rq, prev, next); /* unlocks the rq */
2830 /* 2817 cpu = cpu_of(rq);
2831 * The context switch have flipped the stack from under us
2832 * and restored the local variables which were saved when
2833 * this task called schedule() in the past. prev == current
2834 * is still correct, but it can be moved to another cpu/rq.
2835 */
2836 cpu = smp_processor_id();
2837 rq = cpu_rq(cpu);
2838 } else 2818 } else
2839 raw_spin_unlock_irq(&rq->lock); 2819 raw_spin_unlock_irq(&rq->lock);
2840 2820
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
4653#endif 4633#endif
4654} 4634}
4655 4635
4636int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4637 const struct cpumask *trial)
4638{
4639 int ret = 1, trial_cpus;
4640 struct dl_bw *cur_dl_b;
4641 unsigned long flags;
4642
4643 rcu_read_lock_sched();
4644 cur_dl_b = dl_bw_of(cpumask_any(cur));
4645 trial_cpus = cpumask_weight(trial);
4646
4647 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4648 if (cur_dl_b->bw != -1 &&
4649 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4650 ret = 0;
4651 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4652 rcu_read_unlock_sched();
4653
4654 return ret;
4655}
4656
4657int task_can_attach(struct task_struct *p,
4658 const struct cpumask *cs_cpus_allowed)
4659{
4660 int ret = 0;
4661
4662 /*
4663 * Kthreads which disallow setaffinity shouldn't be moved
4664 * to a new cpuset; we don't want to change their cpu
4665 * affinity and isolating such threads by their set of
4666 * allowed nodes is unnecessary. Thus, cpusets are not
4667 * applicable for such threads. This prevents checking for
4668 * success of set_cpus_allowed_ptr() on all attached tasks
4669 * before cpus_allowed may be changed.
4670 */
4671 if (p->flags & PF_NO_SETAFFINITY) {
4672 ret = -EINVAL;
4673 goto out;
4674 }
4675
4676#ifdef CONFIG_SMP
4677 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4678 cs_cpus_allowed)) {
4679 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4680 cs_cpus_allowed);
4681 struct dl_bw *dl_b;
4682 bool overflow;
4683 int cpus;
4684 unsigned long flags;
4685
4686 rcu_read_lock_sched();
4687 dl_b = dl_bw_of(dest_cpu);
4688 raw_spin_lock_irqsave(&dl_b->lock, flags);
4689 cpus = dl_bw_cpus(dest_cpu);
4690 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4691 if (overflow)
4692 ret = -EBUSY;
4693 else {
4694 /*
4695 * We reserve space for this task in the destination
4696 * root_domain, as we can't fail after this point.
4697 * We will free resources in the source root_domain
4698 * later on (see set_cpus_allowed_dl()).
4699 */
4700 __dl_add(dl_b, p->dl.dl_bw);
4701 }
4702 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4703 rcu_read_unlock_sched();
4704
4705 }
4706#endif
4707out:
4708 return ret;
4709}
4710
4656#ifdef CONFIG_SMP 4711#ifdef CONFIG_SMP
4657/* 4712/*
4658 * move_queued_task - move a queued task to new rq. 4713 * move_queued_task - move a queued task to new rq.
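
The admission test behind cpuset_cpumask_can_shrink() and __dl_overflow() above boils down to "per-CPU deadline bandwidth limit times the number of CPUs must cover the summed bandwidth of the -deadline tasks". A standalone sketch with invented numbers; the kernel uses its own fixed-point scale and a default limit derived from the RT runtime/period knobs (roughly 95% of each CPU):

#include <stdbool.h>
#include <stdio.h>

#define UNIT 1000000ULL                 /* 1.0 CPUs worth of bandwidth, toy scale */

/* Mirrors the check above: shrinking is allowed if limit * cpus still covers total_bw. */
static bool can_shrink(unsigned long long per_cpu_limit,
                       unsigned long long total_bw, int cpus)
{
        return per_cpu_limit * cpus >= total_bw;
}

int main(void)
{
        unsigned long long limit = 95 * UNIT / 100;          /* ~95% of each CPU */
        unsigned long long total_bw = 2 * (40 * UNIT / 100); /* two tasks, 40% each */

        printf("shrink exclusive set to 1 CPU: %s\n",
               can_shrink(limit, total_bw, 1) ? "ok" : "-EBUSY");

        total_bw += 30 * UNIT / 100;                         /* attach one more 30% task */
        printf("same shrink with a third task: %s\n",
               can_shrink(limit, total_bw, 1) ? "ok" : "-EBUSY");
        return 0;
}
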
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6103 6158
6104#ifdef CONFIG_NUMA 6159#ifdef CONFIG_NUMA
6105static int sched_domains_numa_levels; 6160static int sched_domains_numa_levels;
6161enum numa_topology_type sched_numa_topology_type;
6106static int *sched_domains_numa_distance; 6162static int *sched_domains_numa_distance;
6163int sched_max_numa_distance;
6107static struct cpumask ***sched_domains_numa_masks; 6164static struct cpumask ***sched_domains_numa_masks;
6108static int sched_domains_curr_level; 6165static int sched_domains_curr_level;
6109#endif 6166#endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
6275 printk(KERN_WARNING "\n"); 6332 printk(KERN_WARNING "\n");
6276} 6333}
6277 6334
6278static bool find_numa_distance(int distance) 6335bool find_numa_distance(int distance)
6279{ 6336{
6280 int i; 6337 int i;
6281 6338
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
6290 return false; 6347 return false;
6291} 6348}
6292 6349
6350/*
6351 * A system can have three types of NUMA topology:
6352 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6353 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6354 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6355 *
6356 * The difference between a glueless mesh topology and a backplane
6357 * topology lies in whether communication between not directly
6358 * connected nodes goes through intermediary nodes (where programs
6359 * could run), or through backplane controllers. This affects
6360 * placement of programs.
6361 *
6362 * The type of topology can be discerned with the following tests:
6363 * - If the maximum distance between any nodes is 1 hop, the system
6364 * is directly connected.
6365 * - If for two nodes A and B, located N > 1 hops away from each other,
6366 * there is an intermediary node C, which is < N hops away from both
6367 * nodes A and B, the system is a glueless mesh.
6368 */
6369static void init_numa_topology_type(void)
6370{
6371 int a, b, c, n;
6372
6373 n = sched_max_numa_distance;
6374
6375 if (n <= 1)
6376 sched_numa_topology_type = NUMA_DIRECT;
6377
6378 for_each_online_node(a) {
6379 for_each_online_node(b) {
6380 /* Find two nodes furthest removed from each other. */
6381 if (node_distance(a, b) < n)
6382 continue;
6383
6384 /* Is there an intermediary node between a and b? */
6385 for_each_online_node(c) {
6386 if (node_distance(a, c) < n &&
6387 node_distance(b, c) < n) {
6388 sched_numa_topology_type =
6389 NUMA_GLUELESS_MESH;
6390 return;
6391 }
6392 }
6393
6394 sched_numa_topology_type = NUMA_BACKPLANE;
6395 return;
6396 }
6397 }
6398}
6399
6293static void sched_init_numa(void) 6400static void sched_init_numa(void)
6294{ 6401{
6295 int next_distance, curr_distance = node_distance(0, 0); 6402 int next_distance, curr_distance = node_distance(0, 0);
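
A standalone toy version of the classification above, run against a made-up 4-node SLIT-style distance table (10 = local, larger = further). The kernel decides the direct case from sched_max_numa_distance; the cutoff of 20 below is simply what "one hop" means for this invented table:

#include <stdio.h>

enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE };

#define NR_NODES 4

/* Hypothetical SLIT-style table, not from any real machine. */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

static enum numa_topology_type classify(void)
{
        int max = 0, a, b, c;

        for (a = 0; a < NR_NODES; a++)
                for (b = 0; b < NR_NODES; b++)
                        if (dist[a][b] > max)
                                max = dist[a][b];

        if (max <= 20)                  /* every node at most one hop away */
                return NUMA_DIRECT;

        for (a = 0; a < NR_NODES; a++) {
                for (b = 0; b < NR_NODES; b++) {
                        if (dist[a][b] < max)
                                continue;
                        /* a and b are furthest apart; look for a middleman. */
                        for (c = 0; c < NR_NODES; c++)
                                if (dist[a][c] < max && dist[b][c] < max)
                                        return NUMA_GLUELESS_MESH;
                        return NUMA_BACKPLANE;
                }
        }
        return NUMA_DIRECT;
}

int main(void)
{
        printf("topology type: %d\n", classify());   /* 1 == NUMA_GLUELESS_MESH here */
        return 0;
}
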
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
6426 sched_domain_topology = tl; 6533 sched_domain_topology = tl;
6427 6534
6428 sched_domains_numa_levels = level; 6535 sched_domains_numa_levels = level;
6536 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6537
6538 init_numa_topology_type();
6429} 6539}
6430 6540
6431static void sched_domains_numa_masks_set(int cpu) 6541static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
7178 7288
7179void __might_sleep(const char *file, int line, int preempt_offset) 7289void __might_sleep(const char *file, int line, int preempt_offset)
7180{ 7290{
7291 /*
7292 * Blocking primitives will set (and therefore destroy) current->state,
7293 * since we will exit with TASK_RUNNING make sure we enter with it,
7294 * otherwise we will destroy state.
7295 */
7296 if (WARN_ONCE(current->state != TASK_RUNNING,
7297 "do not call blocking ops when !TASK_RUNNING; "
7298 "state=%lx set at [<%p>] %pS\n",
7299 current->state,
7300 (void *)current->task_state_change,
7301 (void *)current->task_state_change))
7302 __set_current_state(TASK_RUNNING);
7303
7304 ___might_sleep(file, line, preempt_offset);
7305}
7306EXPORT_SYMBOL(__might_sleep);
7307
7308void ___might_sleep(const char *file, int line, int preempt_offset)
7309{
7181 static unsigned long prev_jiffy; /* ratelimiting */ 7310 static unsigned long prev_jiffy; /* ratelimiting */
7182 7311
7183 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7312 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7209#endif 7338#endif
7210 dump_stack(); 7339 dump_stack();
7211} 7340}
7212EXPORT_SYMBOL(__might_sleep); 7341EXPORT_SYMBOL(___might_sleep);
7213#endif 7342#endif
7214 7343
7215#ifdef CONFIG_MAGIC_SYSRQ 7344#ifdef CONFIG_MAGIC_SYSRQ
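
Several changes above work together: the mutex optimistic-spin path now forces TASK_RUNNING before schedule_preempt_disabled(), __might_sleep() warns about (and repairs) blocking calls made while !TASK_RUNNING, and the sched_annotate_sleep() calls added in kernel/exit.c appear to mark wait paths where that pattern is known to be benign. A hypothetical driver-style sketch of the pattern the new warning targets:

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/types.h>

static DECLARE_WAIT_QUEUE_HEAD(ev_wq);
static DEFINE_MUTEX(ev_lock);
static bool ev_ready;

static void wait_for_event_buggy(void)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&ev_wq, &wait, TASK_INTERRUPTIBLE);

        /*
         * mutex_lock() may sleep; if it does, it returns with the task back
         * in TASK_RUNNING, silently discarding the state set above. The new
         * __might_sleep() check warns here and restores TASK_RUNNING so the
         * task cannot get stuck.
         */
        mutex_lock(&ev_lock);
        if (!ev_ready) {
                mutex_unlock(&ev_lock);
                schedule();
        } else {
                mutex_unlock(&ev_lock);
        }
        finish_wait(&ev_wq, &wait);
}
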
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
32 29
33#endif /* _LINUX_CPUDL_H */ 30#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
26void cpupri_set(struct cpupri *cp, int cpu, int pri); 26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp); 27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp); 28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif 29#endif
33 30
34#endif /* _LINUX_CPUPRI_H */ 31#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 28fa9d9e9201..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
563{ 563{
564 struct hrtimer *timer = &dl_se->dl_timer; 564 struct hrtimer *timer = &dl_se->dl_timer;
565 565
566 if (hrtimer_active(timer)) {
567 hrtimer_try_to_cancel(timer);
568 return;
569 }
570
571 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 566 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
572 timer->function = dl_task_timer; 567 timer->function = dl_task_timer;
573} 568}
@@ -633,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
633 628
634 sched_rt_avg_update(rq, delta_exec); 629 sched_rt_avg_update(rq, delta_exec);
635 630
636 dl_se->runtime -= delta_exec; 631 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
637 if (dl_runtime_exceeded(rq, dl_se)) { 632 if (dl_runtime_exceeded(rq, dl_se)) {
638 __dequeue_task_dl(rq, curr, 0); 633 __dequeue_task_dl(rq, curr, 0);
639 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 634 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -933,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
933 struct task_struct *curr; 928 struct task_struct *curr;
934 struct rq *rq; 929 struct rq *rq;
935 930
936 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 931 if (sd_flag != SD_BALANCE_WAKE)
937 goto out; 932 goto out;
938 933
939 rq = cpu_rq(cpu); 934 rq = cpu_rq(cpu);
@@ -1018,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{ 1013{
1019 hrtick_start(rq, p->dl.runtime); 1014 hrtick_start(rq, p->dl.runtime);
1020} 1015}
1016#else /* !CONFIG_SCHED_HRTICK */
1017static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{
1019}
1021#endif 1020#endif
1022 1021
1023static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, 1022static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1071,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1071 /* Running task will never be pushed. */ 1070 /* Running task will never be pushed. */
1072 dequeue_pushable_dl_task(rq, p); 1071 dequeue_pushable_dl_task(rq, p);
1073 1072
1074#ifdef CONFIG_SCHED_HRTICK
1075 if (hrtick_enabled(rq)) 1073 if (hrtick_enabled(rq))
1076 start_hrtick_dl(rq, p); 1074 start_hrtick_dl(rq, p);
1077#endif
1078 1075
1079 set_post_schedule(rq); 1076 set_post_schedule(rq);
1080 1077
@@ -1093,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1093{ 1090{
1094 update_curr_dl(rq); 1091 update_curr_dl(rq);
1095 1092
1096#ifdef CONFIG_SCHED_HRTICK
1097 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1093 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1098 start_hrtick_dl(rq, p); 1094 start_hrtick_dl(rq, p);
1099#endif
1100} 1095}
1101 1096
1102static void task_fork_dl(struct task_struct *p) 1097static void task_fork_dl(struct task_struct *p)
@@ -1333,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
1333{ 1328{
1334 struct task_struct *next_task; 1329 struct task_struct *next_task;
1335 struct rq *later_rq; 1330 struct rq *later_rq;
1331 int ret = 0;
1336 1332
1337 if (!rq->dl.overloaded) 1333 if (!rq->dl.overloaded)
1338 return 0; 1334 return 0;
@@ -1378,7 +1374,6 @@ retry:
1378 * The task is still there. We don't try 1374 * The task is still there. We don't try
1379 * again, some other cpu will pull it when ready. 1375 * again, some other cpu will pull it when ready.
1380 */ 1376 */
1381 dequeue_pushable_dl_task(rq, next_task);
1382 goto out; 1377 goto out;
1383 } 1378 }
1384 1379
@@ -1394,6 +1389,7 @@ retry:
1394 deactivate_task(rq, next_task, 0); 1389 deactivate_task(rq, next_task, 0);
1395 set_task_cpu(next_task, later_rq->cpu); 1390 set_task_cpu(next_task, later_rq->cpu);
1396 activate_task(later_rq, next_task, 0); 1391 activate_task(later_rq, next_task, 0);
1392 ret = 1;
1397 1393
1398 resched_curr(later_rq); 1394 resched_curr(later_rq);
1399 1395
@@ -1402,7 +1398,7 @@ retry:
1402out: 1398out:
1403 put_task_struct(next_task); 1399 put_task_struct(next_task);
1404 1400
1405 return 1; 1401 return ret;
1406} 1402}
1407 1403
1408static void push_dl_tasks(struct rq *rq) 1404static void push_dl_tasks(struct rq *rq)
@@ -1508,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1508 p->nr_cpus_allowed > 1 && 1504 p->nr_cpus_allowed > 1 &&
1509 dl_task(rq->curr) && 1505 dl_task(rq->curr) &&
1510 (rq->curr->nr_cpus_allowed < 2 || 1506 (rq->curr->nr_cpus_allowed < 2 ||
1511 dl_entity_preempt(&rq->curr->dl, &p->dl))) { 1507 !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
1512 push_dl_tasks(rq); 1508 push_dl_tasks(rq);
1513 } 1509 }
1514} 1510}
@@ -1517,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1517 const struct cpumask *new_mask) 1513 const struct cpumask *new_mask)
1518{ 1514{
1519 struct rq *rq; 1515 struct rq *rq;
1516 struct root_domain *src_rd;
1520 int weight; 1517 int weight;
1521 1518
1522 BUG_ON(!dl_task(p)); 1519 BUG_ON(!dl_task(p));
1523 1520
1521 rq = task_rq(p);
1522 src_rd = rq->rd;
1523 /*
1524 * Migrating a SCHED_DEADLINE task between exclusive
1525 * cpusets (different root_domains) entails a bandwidth
1526 * update. We already made space for us in the destination
1527 * domain (see cpuset_can_attach()).
1528 */
1529 if (!cpumask_intersects(src_rd->span, new_mask)) {
1530 struct dl_bw *src_dl_b;
1531
1532 src_dl_b = dl_bw_of(cpu_of(rq));
1533 /*
1534 * We now free resources of the root_domain we are migrating
 1535 * off. In the worst case, sched_setattr() may temporarily fail
1536 * until we complete the update.
1537 */
1538 raw_spin_lock(&src_dl_b->lock);
1539 __dl_clear(src_dl_b, p->dl.dl_bw);
1540 raw_spin_unlock(&src_dl_b->lock);
1541 }
1542
1524 /* 1543 /*
1525 * Update only if the task is actually running (i.e., 1544 * Update only if the task is actually running (i.e.,
1526 * it is on the rq AND it is not throttled). 1545 * it is on the rq AND it is not throttled).
@@ -1537,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1537 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1556 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1538 return; 1557 return;
1539 1558
1540 rq = task_rq(p);
1541
1542 /* 1559 /*
1543 * The process used to be able to migrate OR it can now migrate 1560 * The process used to be able to migrate OR it can now migrate
1544 */ 1561 */
@@ -1586,22 +1603,48 @@ void init_sched_dl_class(void)
1586 1603
1587#endif /* CONFIG_SMP */ 1604#endif /* CONFIG_SMP */
1588 1605
1606/*
1607 * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
1608 */
1609static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1610{
1611 struct hrtimer *dl_timer = &p->dl.dl_timer;
1612
1613 /* Nobody will change task's class if pi_lock is held */
1614 lockdep_assert_held(&p->pi_lock);
1615
1616 if (hrtimer_active(dl_timer)) {
1617 int ret = hrtimer_try_to_cancel(dl_timer);
1618
1619 if (unlikely(ret == -1)) {
1620 /*
1621 * Note, p may migrate OR new deadline tasks
1622 * may appear in rq when we are unlocking it.
1623 * A caller of us must be fine with that.
1624 */
1625 raw_spin_unlock(&rq->lock);
1626 hrtimer_cancel(dl_timer);
1627 raw_spin_lock(&rq->lock);
1628 }
1629 }
1630}
1631
1589static void switched_from_dl(struct rq *rq, struct task_struct *p) 1632static void switched_from_dl(struct rq *rq, struct task_struct *p)
1590{ 1633{
1591 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1634 cancel_dl_timer(rq, p);
1592 hrtimer_try_to_cancel(&p->dl.dl_timer);
1593 1635
1594 __dl_clear_params(p); 1636 __dl_clear_params(p);
1595 1637
1596#ifdef CONFIG_SMP
1597 /* 1638 /*
1598 * Since this might be the only -deadline task on the rq, 1639 * Since this might be the only -deadline task on the rq,
1599 * this is the right place to try to pull some other one 1640 * this is the right place to try to pull some other one
1600 * from an overloaded cpu, if any. 1641 * from an overloaded cpu, if any.
1601 */ 1642 */
1602 if (!rq->dl.dl_nr_running) 1643 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
1603 pull_dl_task(rq); 1644 return;
1604#endif 1645
1646 if (pull_dl_task(rq))
1647 resched_curr(rq);
1605} 1648}
1606 1649
1607/* 1650/*
@@ -1622,7 +1665,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1622 1665
1623 if (task_on_rq_queued(p) && rq->curr != p) { 1666 if (task_on_rq_queued(p) && rq->curr != p) {
1624#ifdef CONFIG_SMP 1667#ifdef CONFIG_SMP
1625 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1668 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
1669 push_dl_task(rq) && rq != task_rq(p))
1626 /* Only reschedule if pushing failed */ 1670 /* Only reschedule if pushing failed */
1627 check_resched = 0; 1671 check_resched = 0;
1628#endif /* CONFIG_SMP */ 1672#endif /* CONFIG_SMP */
@@ -1704,3 +1748,12 @@ const struct sched_class dl_sched_class = {
1704 1748
1705 .update_curr = update_curr_dl, 1749 .update_curr = update_curr_dl,
1706}; 1750};
1751
1752#ifdef CONFIG_SCHED_DEBUG
1753extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1754
1755void print_dl_stats(struct seq_file *m, int cpu)
1756{
1757 print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
1758}
1759#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
261#undef P 261#undef P
262} 262}
263 263
264void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
265{
266 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
267 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
268}
269
264extern __read_mostly int sched_clock_running; 270extern __read_mostly int sched_clock_running;
265 271
266static void print_cpu(struct seq_file *m, int cpu) 272static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
329 spin_lock_irqsave(&sched_debug_lock, flags); 335 spin_lock_irqsave(&sched_debug_lock, flags);
330 print_cfs_stats(m, cpu); 336 print_cfs_stats(m, cpu);
331 print_rt_stats(m, cpu); 337 print_rt_stats(m, cpu);
338 print_dl_stats(m, cpu);
332 339
333 print_rq(m, rq, cpu); 340 print_rq(m, rq, cpu);
334 spin_unlock_irqrestore(&sched_debug_lock, flags); 341 spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
528 unsigned long nr_faults = -1; 535 unsigned long nr_faults = -1;
529 int cpu_current, home_node; 536 int cpu_current, home_node;
530 537
531 if (p->numa_faults_memory) 538 if (p->numa_faults)
532 nr_faults = p->numa_faults_memory[2*node + i]; 539 nr_faults = p->numa_faults[2*node + i];
533 540
534 cpu_current = !i ? (task_node(p) == node) : 541 cpu_current = !i ? (task_node(p) == node) :
535 (pol && node_isset(node, pol->v.nodes)); 542 (pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104b254c..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,7 +873,6 @@ struct numa_group {
873 spinlock_t lock; /* nr_tasks, tasks */ 873 spinlock_t lock; /* nr_tasks, tasks */
874 int nr_tasks; 874 int nr_tasks;
875 pid_t gid; 875 pid_t gid;
876 struct list_head task_list;
877 876
878 struct rcu_head rcu; 877 struct rcu_head rcu;
879 nodemask_t active_nodes; 878 nodemask_t active_nodes;
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
901 return p->numa_group ? p->numa_group->gid : 0; 900 return p->numa_group ? p->numa_group->gid : 0;
902} 901}
903 902
904static inline int task_faults_idx(int nid, int priv) 903/*
904 * The averaged statistics, shared & private, memory & cpu,
905 * occupy the first half of the array. The second half of the
906 * array is for current counters, which are averaged into the
907 * first set by task_numa_placement.
908 */
909static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
905{ 910{
906 return NR_NUMA_HINT_FAULT_TYPES * nid + priv; 911 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
907} 912}
908 913
909static inline unsigned long task_faults(struct task_struct *p, int nid) 914static inline unsigned long task_faults(struct task_struct *p, int nid)
910{ 915{
911 if (!p->numa_faults_memory) 916 if (!p->numa_faults)
912 return 0; 917 return 0;
913 918
914 return p->numa_faults_memory[task_faults_idx(nid, 0)] + 919 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
915 p->numa_faults_memory[task_faults_idx(nid, 1)]; 920 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
916} 921}
917 922
918static inline unsigned long group_faults(struct task_struct *p, int nid) 923static inline unsigned long group_faults(struct task_struct *p, int nid)
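
The four per-task fault arrays (numa_faults_memory, numa_faults_buffer_memory, numa_faults_cpu, numa_faults_buffer_cpu) are folded into a single numa_faults array indexed by the new task_faults_idx(). A standalone sketch of the resulting layout; the NUMA_MEM/NUMA_CPU/NUMA_MEMBUF/NUMA_CPUBUF ordering is assumed from the comment above (averaged stats first, scan buffers second), since the actual enum lives in the sched.h hunk not shown here:

#include <stdio.h>

/* Assumed ordering: averaged mem/cpu stats first, then the per-scan buffers. */
enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2      /* private and shared faults */

static int nr_node_ids = 4;             /* pretend this is a 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
        /* Each region is NR_NUMA_HINT_FAULT_TYPES * nr_node_ids = 8 slots wide. */
        printf("NUMA_MEM    nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_MEM, 2, 1));
        printf("NUMA_CPU    nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_CPU, 2, 1));
        printf("NUMA_MEMBUF nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_MEMBUF, 2, 1));
        printf("NUMA_CPUBUF nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_CPUBUF, 2, 1));
        return 0;
}
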
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
920 if (!p->numa_group) 925 if (!p->numa_group)
921 return 0; 926 return 0;
922 927
923 return p->numa_group->faults[task_faults_idx(nid, 0)] + 928 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
924 p->numa_group->faults[task_faults_idx(nid, 1)]; 929 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
925} 930}
926 931
927static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 932static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
928{ 933{
929 return group->faults_cpu[task_faults_idx(nid, 0)] + 934 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
930 group->faults_cpu[task_faults_idx(nid, 1)]; 935 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
936}
937
938/* Handle placement on systems where not all nodes are directly connected. */
939static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
940 int maxdist, bool task)
941{
942 unsigned long score = 0;
943 int node;
944
945 /*
946 * All nodes are directly connected, and the same distance
947 * from each other. No need for fancy placement algorithms.
948 */
949 if (sched_numa_topology_type == NUMA_DIRECT)
950 return 0;
951
952 /*
953 * This code is called for each node, introducing N^2 complexity,
954 * which should be ok given the number of nodes rarely exceeds 8.
955 */
956 for_each_online_node(node) {
957 unsigned long faults;
958 int dist = node_distance(nid, node);
959
960 /*
961 * The furthest away nodes in the system are not interesting
962 * for placement; nid was already counted.
963 */
964 if (dist == sched_max_numa_distance || node == nid)
965 continue;
966
967 /*
968 * On systems with a backplane NUMA topology, compare groups
969 * of nodes, and move tasks towards the group with the most
970 * memory accesses. When comparing two nodes at distance
971 * "hoplimit", only nodes closer by than "hoplimit" are part
972 * of each group. Skip other nodes.
973 */
974 if (sched_numa_topology_type == NUMA_BACKPLANE &&
975 dist > maxdist)
976 continue;
977
978 /* Add up the faults from nearby nodes. */
979 if (task)
980 faults = task_faults(p, node);
981 else
982 faults = group_faults(p, node);
983
984 /*
985 * On systems with a glueless mesh NUMA topology, there are
986 * no fixed "groups of nodes". Instead, nodes that are not
987 * directly connected bounce traffic through intermediate
988 * nodes; a numa_group can occupy any set of nodes.
989 * The further away a node is, the less the faults count.
990 * This seems to result in good task placement.
991 */
992 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
993 faults *= (sched_max_numa_distance - dist);
994 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
995 }
996
997 score += faults;
998 }
999
1000 return score;
931} 1001}
932 1002
933/* 1003/*
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
936 * larger multiplier, in order to group tasks together that are almost 1006 * larger multiplier, in order to group tasks together that are almost
937 * evenly spread out between numa nodes. 1007 * evenly spread out between numa nodes.
938 */ 1008 */
939static inline unsigned long task_weight(struct task_struct *p, int nid) 1009static inline unsigned long task_weight(struct task_struct *p, int nid,
1010 int dist)
940{ 1011{
941 unsigned long total_faults; 1012 unsigned long faults, total_faults;
942 1013
943 if (!p->numa_faults_memory) 1014 if (!p->numa_faults)
944 return 0; 1015 return 0;
945 1016
946 total_faults = p->total_numa_faults; 1017 total_faults = p->total_numa_faults;
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
948 if (!total_faults) 1019 if (!total_faults)
949 return 0; 1020 return 0;
950 1021
951 return 1000 * task_faults(p, nid) / total_faults; 1022 faults = task_faults(p, nid);
1023 faults += score_nearby_nodes(p, nid, dist, true);
1024
1025 return 1000 * faults / total_faults;
952} 1026}
953 1027
954static inline unsigned long group_weight(struct task_struct *p, int nid) 1028static inline unsigned long group_weight(struct task_struct *p, int nid,
1029 int dist)
955{ 1030{
956 if (!p->numa_group || !p->numa_group->total_faults) 1031 unsigned long faults, total_faults;
1032
1033 if (!p->numa_group)
957 return 0; 1034 return 0;
958 1035
959 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 1036 total_faults = p->numa_group->total_faults;
1037
1038 if (!total_faults)
1039 return 0;
1040
1041 faults = group_faults(p, nid);
1042 faults += score_nearby_nodes(p, nid, dist, false);
1043
1044 return 1000 * faults / total_faults;
960} 1045}
961 1046
962bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1047bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
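
task_weight() and group_weight() now take a distance argument and fold in score_nearby_nodes(), so faults on nearby nodes contribute to a node's score. On a glueless mesh the contribution decays linearly with distance, as faults * (sched_max_numa_distance - dist) / (sched_max_numa_distance - LOCAL_DISTANCE). A quick arithmetic sketch with invented numbers:

#include <stdio.h>

#define LOCAL_DISTANCE 10       /* SLIT convention for "same node" */

int main(void)
{
        int max_dist = 30;              /* pretend sched_max_numa_distance */
        unsigned long faults = 1000;    /* hinting faults seen on a neighbouring node */
        int dist;

        /* The kernel skips nodes at max_dist entirely; they would contribute 0 anyway. */
        for (dist = LOCAL_DISTANCE; dist <= max_dist; dist += 10)
                printf("dist=%d -> contributes %lu faults\n", dist,
                       faults * (max_dist - dist) / (max_dist - LOCAL_DISTANCE));
        return 0;
}
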
@@ -1089,6 +1174,7 @@ struct task_numa_env {
1089 struct numa_stats src_stats, dst_stats; 1174 struct numa_stats src_stats, dst_stats;
1090 1175
1091 int imbalance_pct; 1176 int imbalance_pct;
1177 int dist;
1092 1178
1093 struct task_struct *best_task; 1179 struct task_struct *best_task;
1094 long best_imp; 1180 long best_imp;
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env,
1168 long load; 1254 long load;
1169 long imp = env->p->numa_group ? groupimp : taskimp; 1255 long imp = env->p->numa_group ? groupimp : taskimp;
1170 long moveimp = imp; 1256 long moveimp = imp;
1257 int dist = env->dist;
1171 1258
1172 rcu_read_lock(); 1259 rcu_read_lock();
1173 1260
@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
1208 * in any group then look only at task weights. 1295 * in any group then look only at task weights.
1209 */ 1296 */
1210 if (cur->numa_group == env->p->numa_group) { 1297 if (cur->numa_group == env->p->numa_group) {
1211 imp = taskimp + task_weight(cur, env->src_nid) - 1298 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1212 task_weight(cur, env->dst_nid); 1299 task_weight(cur, env->dst_nid, dist);
1213 /* 1300 /*
1214 * Add some hysteresis to prevent swapping the 1301 * Add some hysteresis to prevent swapping the
1215 * tasks within a group over tiny differences. 1302 * tasks within a group over tiny differences.
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
1223 * instead. 1310 * instead.
1224 */ 1311 */
1225 if (cur->numa_group) 1312 if (cur->numa_group)
1226 imp += group_weight(cur, env->src_nid) - 1313 imp += group_weight(cur, env->src_nid, dist) -
1227 group_weight(cur, env->dst_nid); 1314 group_weight(cur, env->dst_nid, dist);
1228 else 1315 else
1229 imp += task_weight(cur, env->src_nid) - 1316 imp += task_weight(cur, env->src_nid, dist) -
1230 task_weight(cur, env->dst_nid); 1317 task_weight(cur, env->dst_nid, dist);
1231 } 1318 }
1232 } 1319 }
1233 1320
@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
1326 }; 1413 };
1327 struct sched_domain *sd; 1414 struct sched_domain *sd;
1328 unsigned long taskweight, groupweight; 1415 unsigned long taskweight, groupweight;
1329 int nid, ret; 1416 int nid, ret, dist;
1330 long taskimp, groupimp; 1417 long taskimp, groupimp;
1331 1418
1332 /* 1419 /*
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
1354 return -EINVAL; 1441 return -EINVAL;
1355 } 1442 }
1356 1443
1357 taskweight = task_weight(p, env.src_nid);
1358 groupweight = group_weight(p, env.src_nid);
1359 update_numa_stats(&env.src_stats, env.src_nid);
1360 env.dst_nid = p->numa_preferred_nid; 1444 env.dst_nid = p->numa_preferred_nid;
1361 taskimp = task_weight(p, env.dst_nid) - taskweight; 1445 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1362 groupimp = group_weight(p, env.dst_nid) - groupweight; 1446 taskweight = task_weight(p, env.src_nid, dist);
1447 groupweight = group_weight(p, env.src_nid, dist);
1448 update_numa_stats(&env.src_stats, env.src_nid);
1449 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1450 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1363 update_numa_stats(&env.dst_stats, env.dst_nid); 1451 update_numa_stats(&env.dst_stats, env.dst_nid);
1364 1452
1365 /* Try to find a spot on the preferred nid. */ 1453 /* Try to find a spot on the preferred nid. */
1366 task_numa_find_cpu(&env, taskimp, groupimp); 1454 task_numa_find_cpu(&env, taskimp, groupimp);
1367 1455
1368 /* No space available on the preferred nid. Look elsewhere. */ 1456 /*
1369 if (env.best_cpu == -1) { 1457 * Look at other nodes in these cases:
1458 * - there is no space available on the preferred_nid
1459 * - the task is part of a numa_group that is interleaved across
1460 * multiple NUMA nodes; in order to better consolidate the group,
1461 * we need to check other locations.
1462 */
1463 if (env.best_cpu == -1 || (p->numa_group &&
1464 nodes_weight(p->numa_group->active_nodes) > 1)) {
1370 for_each_online_node(nid) { 1465 for_each_online_node(nid) {
1371 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1466 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1372 continue; 1467 continue;
1373 1468
1469 dist = node_distance(env.src_nid, env.dst_nid);
1470 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1471 dist != env.dist) {
1472 taskweight = task_weight(p, env.src_nid, dist);
1473 groupweight = group_weight(p, env.src_nid, dist);
1474 }
1475
1374 /* Only consider nodes where both task and groups benefit */ 1476 /* Only consider nodes where both task and groups benefit */
1375 taskimp = task_weight(p, nid) - taskweight; 1477 taskimp = task_weight(p, nid, dist) - taskweight;
1376 groupimp = group_weight(p, nid) - groupweight; 1478 groupimp = group_weight(p, nid, dist) - groupweight;
1377 if (taskimp < 0 && groupimp < 0) 1479 if (taskimp < 0 && groupimp < 0)
1378 continue; 1480 continue;
1379 1481
1482 env.dist = dist;
1380 env.dst_nid = nid; 1483 env.dst_nid = nid;
1381 update_numa_stats(&env.dst_stats, env.dst_nid); 1484 update_numa_stats(&env.dst_stats, env.dst_nid);
1382 task_numa_find_cpu(&env, taskimp, groupimp); 1485 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1431 unsigned long interval = HZ; 1534 unsigned long interval = HZ;
1432 1535
1433 /* This task has no NUMA fault statistics yet */ 1536 /* This task has no NUMA fault statistics yet */
1434 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1537 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1435 return; 1538 return;
1436 1539
1437 /* Periodically retry migrating the task to the preferred node */ 1540 /* Periodically retry migrating the task to the preferred node */
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1580 return delta; 1683 return delta;
1581} 1684}
1582 1685
1686/*
1687 * Determine the preferred nid for a task in a numa_group. This needs to
1688 * be done in a way that produces consistent results with group_weight,
1689 * otherwise workloads might not converge.
1690 */
1691static int preferred_group_nid(struct task_struct *p, int nid)
1692{
1693 nodemask_t nodes;
1694 int dist;
1695
1696 /* Direct connections between all NUMA nodes. */
1697 if (sched_numa_topology_type == NUMA_DIRECT)
1698 return nid;
1699
1700 /*
1701 * On a system with glueless mesh NUMA topology, group_weight
1702 * scores nodes according to the number of NUMA hinting faults on
1703 * both the node itself, and on nearby nodes.
1704 */
1705 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1706 unsigned long score, max_score = 0;
1707 int node, max_node = nid;
1708
1709 dist = sched_max_numa_distance;
1710
1711 for_each_online_node(node) {
1712 score = group_weight(p, node, dist);
1713 if (score > max_score) {
1714 max_score = score;
1715 max_node = node;
1716 }
1717 }
1718 return max_node;
1719 }
1720
1721 /*
1722 * Finding the preferred nid in a system with NUMA backplane
1723 * interconnect topology is more involved. The goal is to locate
1724 * tasks from numa_groups near each other in the system, and
1725 * untangle workloads from different sides of the system. This requires
1726 * searching down the hierarchy of node groups, recursively searching
1727 * inside the highest scoring group of nodes. The nodemask tricks
1728 * keep the complexity of the search down.
1729 */
1730 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0;
1733 nodemask_t max_group;
1734 int a, b;
1735
1736 /* Are there nodes at this distance from each other? */
1737 if (!find_numa_distance(dist))
1738 continue;
1739
1740 for_each_node_mask(a, nodes) {
1741 unsigned long faults = 0;
1742 nodemask_t this_group;
1743 nodes_clear(this_group);
1744
1745 /* Sum group's NUMA faults; includes a==b case. */
1746 for_each_node_mask(b, nodes) {
1747 if (node_distance(a, b) < dist) {
1748 faults += group_faults(p, b);
1749 node_set(b, this_group);
1750 node_clear(b, nodes);
1751 }
1752 }
1753
1754 /* Remember the top group. */
1755 if (faults > max_faults) {
1756 max_faults = faults;
1757 max_group = this_group;
1758 /*
1759 * subtle: at the smallest distance there is
1760 * just one node left in each "group", the
1761 * winner is the preferred nid.
1762 */
1763 nid = a;
1764 }
1765 }
1766 /* Next round, evaluate the nodes within max_group. */
1767 nodes = max_group;
1768 }
1769 return nid;
1770}
1771
1583static void task_numa_placement(struct task_struct *p) 1772static void task_numa_placement(struct task_struct *p)
1584{ 1773{
1585 int seq, nid, max_nid = -1, max_group_nid = -1; 1774 int seq, nid, max_nid = -1, max_group_nid = -1;
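
For the backplane case, preferred_group_nid() above repeatedly partitions the remaining nodes into groups of nodes closer than the current distance, keeps the group with the most NUMA hinting faults, and narrows the search until a single node is left. A toy, single-purpose version for two 2-node boards behind a backplane; distances and fault counts are invented, and the kernel additionally removes grouped nodes from the working set as it scans, which this sketch skips:

#include <stdio.h>
#include <string.h>

#define NR_NODES 4
#define LOCAL_DISTANCE 10

static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },     /* nodes 0,1 on one board */
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },     /* nodes 2,3 on the other */
        { 40, 40, 20, 10 },
};
static const unsigned long faults_on[NR_NODES] = { 100, 300, 250, 200 };

int main(void)
{
        int in_set[NR_NODES] = { 1, 1, 1, 1 };  /* start from all online nodes */
        int nid = 1;                            /* single busiest node, as the caller passes in */
        int d, a, b;

        for (d = 40; d > LOCAL_DISTANCE; d -= 20) {     /* distances present: 40, 20 */
                unsigned long max_faults = 0;
                int max_group[NR_NODES] = { 0 };

                for (a = 0; a < NR_NODES; a++) {
                        unsigned long faults = 0;
                        int this_group[NR_NODES] = { 0 };

                        if (!in_set[a])
                                continue;
                        /* Group = nodes still in the set and closer than 'd' to a. */
                        for (b = 0; b < NR_NODES; b++) {
                                if (in_set[b] && dist[a][b] < d) {
                                        faults += faults_on[b];
                                        this_group[b] = 1;
                                }
                        }
                        if (faults > max_faults) {
                                max_faults = faults;
                                memcpy(max_group, this_group, sizeof(max_group));
                                nid = a;        /* at the last round each group is one node */
                        }
                }
                memcpy(in_set, max_group, sizeof(in_set));
        }
        printf("preferred nid: %d\n", nid);     /* 2: busiest node on the busier board */
        return 0;
}
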
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
1607 1796
1608 /* Find the node with the highest number of faults */ 1797 /* Find the node with the highest number of faults */
1609 for_each_online_node(nid) { 1798 for_each_online_node(nid) {
1799 /* Keep track of the offsets in numa_faults array */
1800 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1610 unsigned long faults = 0, group_faults = 0; 1801 unsigned long faults = 0, group_faults = 0;
1611 int priv, i; 1802 int priv;
1612 1803
1613 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 1804 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1614 long diff, f_diff, f_weight; 1805 long diff, f_diff, f_weight;
1615 1806
1616 i = task_faults_idx(nid, priv); 1807 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1808 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1809 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1810 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1617 1811
1618 /* Decay existing window, copy faults since last scan */ 1812 /* Decay existing window, copy faults since last scan */
1619 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; 1813 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1620 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1814 fault_types[priv] += p->numa_faults[membuf_idx];
1621 p->numa_faults_buffer_memory[i] = 0; 1815 p->numa_faults[membuf_idx] = 0;
1622 1816
1623 /* 1817 /*
1624 * Normalize the faults_from, so all tasks in a group 1818 * Normalize the faults_from, so all tasks in a group
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
1628 * faults are less important. 1822 * faults are less important.
1629 */ 1823 */
1630 f_weight = div64_u64(runtime << 16, period + 1); 1824 f_weight = div64_u64(runtime << 16, period + 1);
1631 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / 1825 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1632 (total_faults + 1); 1826 (total_faults + 1);
1633 f_diff = f_weight - p->numa_faults_cpu[i] / 2; 1827 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1634 p->numa_faults_buffer_cpu[i] = 0; 1828 p->numa_faults[cpubuf_idx] = 0;
1635 1829
1636 p->numa_faults_memory[i] += diff; 1830 p->numa_faults[mem_idx] += diff;
1637 p->numa_faults_cpu[i] += f_diff; 1831 p->numa_faults[cpu_idx] += f_diff;
1638 faults += p->numa_faults_memory[i]; 1832 faults += p->numa_faults[mem_idx];
1639 p->total_numa_faults += diff; 1833 p->total_numa_faults += diff;
1640 if (p->numa_group) { 1834 if (p->numa_group) {
1641 /* safe because we can only change our own group */ 1835 /*
1642 p->numa_group->faults[i] += diff; 1836 * safe because we can only change our own group
1643 p->numa_group->faults_cpu[i] += f_diff; 1837 *
1838 * mem_idx represents the offset for a given
1839 * nid and priv in a specific region because it
1840 * is at the beginning of the numa_faults array.
1841 */
1842 p->numa_group->faults[mem_idx] += diff;
1843 p->numa_group->faults_cpu[mem_idx] += f_diff;
1644 p->numa_group->total_faults += diff; 1844 p->numa_group->total_faults += diff;
1645 group_faults += p->numa_group->faults[i]; 1845 group_faults += p->numa_group->faults[mem_idx];
1646 } 1846 }
1647 } 1847 }
1648 1848
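The mem_idx/membuf_idx/cpu_idx/cpubuf_idx values above are all offsets into the single flattened numa_faults array. A user-space sketch of the indexing this implies; the helper body is inferred from the NUMA_MEM/NUMA_CPU/NUMA_MEMBUF/NUMA_CPUBUF ordering and from the old 2/4/6 * nr_node_ids pointer offsets being deleted further down, so treat it as an illustration rather than a quote of the kernel helper (nr_node_ids is a stand-in constant):

#include <stdio.h>

enum numa_faults_stats { NUMA_MEM = 0, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2    /* private, shared */
static const int nr_node_ids = 4;     /* assumed 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
    /* region first, then node within the region, then priv within the node */
    return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
    /* the per-scan memory-fault buffer slot for node 1, private faults */
    printf("%d\n", task_faults_idx(NUMA_MEMBUF, 1, 1));    /* 2 * (2*4 + 1) + 1 = 19 */
    return 0;
}

Because NUMA_MEM is region 0, mem_idx is also a valid index straight into numa_group->faults[] and ->faults_cpu[], which is what the group-accounting comment above relies on.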
@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
1662 if (p->numa_group) { 1862 if (p->numa_group) {
1663 update_numa_active_node_mask(p->numa_group); 1863 update_numa_active_node_mask(p->numa_group);
1664 spin_unlock_irq(group_lock); 1864 spin_unlock_irq(group_lock);
1665 max_nid = max_group_nid; 1865 max_nid = preferred_group_nid(p, max_group_nid);
1666 } 1866 }
1667 1867
1668 if (max_faults) { 1868 if (max_faults) {
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1705 1905
1706 atomic_set(&grp->refcount, 1); 1906 atomic_set(&grp->refcount, 1);
1707 spin_lock_init(&grp->lock); 1907 spin_lock_init(&grp->lock);
1708 INIT_LIST_HEAD(&grp->task_list);
1709 grp->gid = p->pid; 1908 grp->gid = p->pid;
1710 /* Second half of the array tracks nids where faults happen */ 1909 /* Second half of the array tracks nids where faults happen */
1711 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 1910 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1714 node_set(task_node(current), grp->active_nodes); 1913 node_set(task_node(current), grp->active_nodes);
1715 1914
1716 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1915 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1717 grp->faults[i] = p->numa_faults_memory[i]; 1916 grp->faults[i] = p->numa_faults[i];
1718 1917
1719 grp->total_faults = p->total_numa_faults; 1918 grp->total_faults = p->total_numa_faults;
1720 1919
1721 list_add(&p->numa_entry, &grp->task_list);
1722 grp->nr_tasks++; 1920 grp->nr_tasks++;
1723 rcu_assign_pointer(p->numa_group, grp); 1921 rcu_assign_pointer(p->numa_group, grp);
1724 } 1922 }
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1773 double_lock_irq(&my_grp->lock, &grp->lock); 1971 double_lock_irq(&my_grp->lock, &grp->lock);
1774 1972
1775 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1973 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1776 my_grp->faults[i] -= p->numa_faults_memory[i]; 1974 my_grp->faults[i] -= p->numa_faults[i];
1777 grp->faults[i] += p->numa_faults_memory[i]; 1975 grp->faults[i] += p->numa_faults[i];
1778 } 1976 }
1779 my_grp->total_faults -= p->total_numa_faults; 1977 my_grp->total_faults -= p->total_numa_faults;
1780 grp->total_faults += p->total_numa_faults; 1978 grp->total_faults += p->total_numa_faults;
1781 1979
1782 list_move(&p->numa_entry, &grp->task_list);
1783 my_grp->nr_tasks--; 1980 my_grp->nr_tasks--;
1784 grp->nr_tasks++; 1981 grp->nr_tasks++;
1785 1982
@@ -1799,27 +1996,23 @@ no_join:
1799void task_numa_free(struct task_struct *p) 1996void task_numa_free(struct task_struct *p)
1800{ 1997{
1801 struct numa_group *grp = p->numa_group; 1998 struct numa_group *grp = p->numa_group;
1802 void *numa_faults = p->numa_faults_memory; 1999 void *numa_faults = p->numa_faults;
1803 unsigned long flags; 2000 unsigned long flags;
1804 int i; 2001 int i;
1805 2002
1806 if (grp) { 2003 if (grp) {
1807 spin_lock_irqsave(&grp->lock, flags); 2004 spin_lock_irqsave(&grp->lock, flags);
1808 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1809 grp->faults[i] -= p->numa_faults_memory[i]; 2006 grp->faults[i] -= p->numa_faults[i];
1810 grp->total_faults -= p->total_numa_faults; 2007 grp->total_faults -= p->total_numa_faults;
1811 2008
1812 list_del(&p->numa_entry);
1813 grp->nr_tasks--; 2009 grp->nr_tasks--;
1814 spin_unlock_irqrestore(&grp->lock, flags); 2010 spin_unlock_irqrestore(&grp->lock, flags);
1815 RCU_INIT_POINTER(p->numa_group, NULL); 2011 RCU_INIT_POINTER(p->numa_group, NULL);
1816 put_numa_group(grp); 2012 put_numa_group(grp);
1817 } 2013 }
1818 2014
1819 p->numa_faults_memory = NULL; 2015 p->numa_faults = NULL;
1820 p->numa_faults_buffer_memory = NULL;
1821 p->numa_faults_cpu= NULL;
1822 p->numa_faults_buffer_cpu = NULL;
1823 kfree(numa_faults); 2016 kfree(numa_faults);
1824} 2017}
1825 2018
@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1842 return; 2035 return;
1843 2036
1844 /* Allocate buffer to track faults on a per-node basis */ 2037 /* Allocate buffer to track faults on a per-node basis */
1845 if (unlikely(!p->numa_faults_memory)) { 2038 if (unlikely(!p->numa_faults)) {
1846 int size = sizeof(*p->numa_faults_memory) * 2039 int size = sizeof(*p->numa_faults) *
1847 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2040 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1848 2041
1849 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2042 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1850 if (!p->numa_faults_memory) 2043 if (!p->numa_faults)
1851 return; 2044 return;
1852 2045
1853 BUG_ON(p->numa_faults_buffer_memory);
1854 /*
1855 * The averaged statistics, shared & private, memory & cpu,
1856 * occupy the first half of the array. The second half of the
1857 * array is for current counters, which are averaged into the
1858 * first set by task_numa_placement.
1859 */
1860 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1861 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1862 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1863 p->total_numa_faults = 0; 2046 p->total_numa_faults = 0;
1864 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2047 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1865 } 2048 }
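For reference, a sketch of the region layout the single kzalloc() above presumably stands for, derived from the 2/4/6 * nr_node_ids offsets of the four pointers being removed here (not quoted from the resulting file):

/*
 *   p->numa_faults + 0               : NUMA_MEM     averaged memory faults
 *   p->numa_faults + 2 * nr_node_ids : NUMA_CPU     averaged cpu faults
 *   p->numa_faults + 4 * nr_node_ids : NUMA_MEMBUF  per-scan memory fault buffer
 *   p->numa_faults + 6 * nr_node_ids : NUMA_CPUBUF  per-scan cpu fault buffer
 *
 * i.e. NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids entries in total, with the
 * averaged statistics in the first half and the per-scan buffers, which
 * task_numa_placement() folds back in, in the second half.
 */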
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1899 if (migrated) 2082 if (migrated)
1900 p->numa_pages_migrated += pages; 2083 p->numa_pages_migrated += pages;
1901 2084
1902 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 2085 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
1903 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 2086 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
1904 p->numa_faults_locality[local] += pages; 2087 p->numa_faults_locality[local] += pages;
1905} 2088}
1906 2089
@@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4469 latest_idle_timestamp = rq->idle_stamp; 4652 latest_idle_timestamp = rq->idle_stamp;
4470 shallowest_idle_cpu = i; 4653 shallowest_idle_cpu = i;
4471 } 4654 }
4472 } else { 4655 } else if (shallowest_idle_cpu == -1) {
4473 load = weighted_cpuload(i); 4656 load = weighted_cpuload(i);
4474 if (load < min_load || (load == min_load && i == this_cpu)) { 4657 if (load < min_load || (load == min_load && i == this_cpu)) {
4475 min_load = load; 4658 min_load = load;
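The new else-if guard means the load comparison only runs while no idle CPU has been found, so an idle CPU always beats a merely lightly loaded one. A minimal user-space sketch of that selection order (the struct and the numbers are made up; the real function additionally prefers the most recently idled CPU among equally shallow idle states, which this sketch omits):

#include <stdio.h>

struct cpu_stat {
    int idle;
    unsigned int exit_latency;    /* cost of leaving the idle state */
    unsigned long load;
};

static int find_idlest(const struct cpu_stat *cs, int n)
{
    unsigned long min_load = ~0UL;
    unsigned int min_exit_latency = ~0U;
    int shallowest_idle_cpu = -1, least_loaded_cpu = -1;

    for (int i = 0; i < n; i++) {
        if (cs[i].idle) {
            if (cs[i].exit_latency < min_exit_latency) {
                min_exit_latency = cs[i].exit_latency;
                shallowest_idle_cpu = i;
            }
        } else if (shallowest_idle_cpu == -1) {    /* the new guard */
            if (cs[i].load < min_load) {
                min_load = cs[i].load;
                least_loaded_cpu = i;
            }
        }
    }
    return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

int main(void)
{
    const struct cpu_stat cs[] = {
        { 0,  0, 100 },    /* busy, light load   */
        { 1, 20,   0 },    /* idle, deep C-state */
        { 1,  5,   0 },    /* idle, shallow      */
        { 0,  0,  50 },    /* busy               */
    };
    printf("%d\n", find_idlest(cs, 4));    /* 2: the shallowest idle CPU */
    return 0;
}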
@@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4547 int want_affine = 0; 4730 int want_affine = 0;
4548 int sync = wake_flags & WF_SYNC; 4731 int sync = wake_flags & WF_SYNC;
4549 4732
4550 if (p->nr_cpus_allowed == 1)
4551 return prev_cpu;
4552
4553 if (sd_flag & SD_BALANCE_WAKE) 4733 if (sd_flag & SD_BALANCE_WAKE)
4554 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4734 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4555 4735
@@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5189 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5369 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5190 int src_nid, dst_nid; 5370 int src_nid, dst_nid;
5191 5371
5192 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5372 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5193 !(env->sd->flags & SD_NUMA)) { 5373 !(env->sd->flags & SD_NUMA)) {
5194 return false; 5374 return false;
5195 } 5375 }
@@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5228 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5408 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5229 return false; 5409 return false;
5230 5410
5231 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) 5411 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5232 return false; 5412 return false;
5233 5413
5234 src_nid = cpu_to_node(env->src_cpu); 5414 src_nid = cpu_to_node(env->src_cpu);
@@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6172 * with a large weight task outweighs the tasks on the system). 6352 * with a large weight task outweighs the tasks on the system).
6173 */ 6353 */
6174 if (prefer_sibling && sds->local && 6354 if (prefer_sibling && sds->local &&
6175 sds->local_stat.group_has_free_capacity) 6355 sds->local_stat.group_has_free_capacity) {
6176 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6356 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6357 sgs->group_type = group_classify(sg, sgs);
6358 }
6177 6359
6178 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6360 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6179 sds->busiest = sg; 6361 sds->busiest = sg;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca398084a..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1301 struct task_struct *curr; 1301 struct task_struct *curr;
1302 struct rq *rq; 1302 struct rq *rq;
1303 1303
1304 if (p->nr_cpus_allowed == 1)
1305 goto out;
1306
1307 /* For anything but wake ups, just return the task_cpu */ 1304 /* For anything but wake ups, just return the task_cpu */
1308 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1305 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1309 goto out; 1306 goto out;
@@ -1351,16 +1348,22 @@ out:
1351 1348
1352static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1349static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1353{ 1350{
1354 if (rq->curr->nr_cpus_allowed == 1) 1351 /*
1352 * Current can't be migrated, useless to reschedule,
1353 * let's hope p can move out.
1354 */
1355 if (rq->curr->nr_cpus_allowed == 1 ||
1356 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1355 return; 1357 return;
1356 1358
1359 /*
1360 * p is migratable, so let's not schedule it and
1361 * see if it is pushed or pulled somewhere else.
1362 */
1357 if (p->nr_cpus_allowed != 1 1363 if (p->nr_cpus_allowed != 1
1358 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1364 && cpupri_find(&rq->rd->cpupri, p, NULL))
1359 return; 1365 return;
1360 1366
1361 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1362 return;
1363
1364 /* 1367 /*
1365 * There appears to be other cpus that can accept 1368 * There appears to be other cpus that can accept
1366 * current and none to run 'p', so lets reschedule 1369 * current and none to run 'p', so lets reschedule
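Pieced together from the hunk above, the reordered check presumably reads roughly as follows; the tail of the function is outside this hunk and is left as a placeholder rather than guessed at:

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
        /*
         * Current can't be migrated, useless to reschedule,
         * let's hope p can move out.
         */
        if (rq->curr->nr_cpus_allowed == 1 ||
            !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
                return;

        /*
         * p is migratable, so let's not schedule it and
         * see if it is pushed or pulled somewhere else.
         */
        if (p->nr_cpus_allowed != 1 &&
            cpupri_find(&rq->rd->cpupri, p, NULL))
                return;

        /* ... otherwise reschedule current (remainder not shown in this hunk) */
}

The reordering asks whether current can move at all before considering p, since rescheduling is pointless when current is pinned or has nowhere else to run.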
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2df8ef067cc5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
176 u64 bw, total_bw; 176 u64 bw, total_bw;
177}; 177};
178 178
179static inline
180void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
181{
182 dl_b->total_bw -= tsk_bw;
183}
184
185static inline
186void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
187{
188 dl_b->total_bw += tsk_bw;
189}
190
191static inline
192bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
193{
194 return dl_b->bw != -1 &&
195 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
196}
197
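Taken together these helpers back the root-domain admission test for SCHED_DEADLINE bandwidth. A small user-space sketch of how they combine; the to_ratio() helper, the <<20 fixed point and every number below are assumptions for the example, not taken from this diff:

#include <stdio.h>
#include <stdint.h>

struct dl_bw { int64_t bw; int64_t total_bw; };

/* fixed-point utilization: runtime/period scaled by 2^20 (assumed scale) */
static int64_t to_ratio(int64_t period, int64_t runtime)
{
    return (runtime << 20) / period;
}

static int dl_overflow(const struct dl_bw *dl_b, int cpus,
                       int64_t old_bw, int64_t new_bw)
{
    /* same test as __dl_overflow() above: would total demand exceed supply? */
    return dl_b->bw != -1 &&
           dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
    /* root domain with 2 CPUs, 95% of each CPU available to -deadline tasks */
    struct dl_bw dl_b = { .bw = to_ratio(100, 95), .total_bw = 0 };
    /* each task asks for 30ms of runtime every 100ms period */
    int64_t task_bw = to_ratio(100000, 30000);

    for (int i = 0; i < 8; i++) {
        if (dl_overflow(&dl_b, 2, 0, task_bw)) {
            printf("task %d rejected\n", i);
            break;
        }
        dl_b.total_bw += task_bw;    /* __dl_add() */
        printf("task %d admitted\n", i);
    }
    return 0;
}

With 2 CPUs at 95% each, six 30%-utilization tasks fit (1.8 < 1.9) and the seventh is refused (2.1 > 1.9), which is exactly the inequality __dl_overflow() expresses.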
179extern struct mutex sched_domains_mutex; 198extern struct mutex sched_domains_mutex;
180 199
181#ifdef CONFIG_CGROUP_SCHED 200#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
678 return rq->clock_task; 697 return rq->clock_task;
679} 698}
680 699
700#ifdef CONFIG_NUMA
701enum numa_topology_type {
702 NUMA_DIRECT,
703 NUMA_GLUELESS_MESH,
704 NUMA_BACKPLANE,
705};
706extern enum numa_topology_type sched_numa_topology_type;
707extern int sched_max_numa_distance;
708extern bool find_numa_distance(int distance);
709#endif
710
681#ifdef CONFIG_NUMA_BALANCING 711#ifdef CONFIG_NUMA_BALANCING
712/* The regions in numa_faults array from task_struct */
713enum numa_faults_stats {
714 NUMA_MEM = 0,
715 NUMA_CPU,
716 NUMA_MEMBUF,
717 NUMA_CPUBUF
718};
682extern void sched_setnuma(struct task_struct *p, int node); 719extern void sched_setnuma(struct task_struct *p, int node);
683extern int migrate_task_to(struct task_struct *p, int cpu); 720extern int migrate_task_to(struct task_struct *p, int cpu);
684extern int migrate_swap(struct task_struct *, struct task_struct *); 721extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
1127 void (*task_fork) (struct task_struct *p); 1164 void (*task_fork) (struct task_struct *p);
1128 void (*task_dead) (struct task_struct *p); 1165 void (*task_dead) (struct task_struct *p);
1129 1166
1167 /*
1168 * The switched_from() call is allowed to drop rq->lock, therefore we
1169 * cannot assume the switched_from/switched_to pair is serliazed by
1170 * rq->lock. They are however serialized by p->pi_lock.
1171 */
1130 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1172 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1131 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1173 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1132 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1174 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1504extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1546extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1505extern void print_cfs_stats(struct seq_file *m, int cpu); 1547extern void print_cfs_stats(struct seq_file *m, int cpu);
1506extern void print_rt_stats(struct seq_file *m, int cpu); 1548extern void print_rt_stats(struct seq_file *m, int cpu);
1549extern void print_dl_stats(struct seq_file *m, int cpu);
1507 1550
1508extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1551extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1509extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1552extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12#include <linux/kthread.h>
12 13
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 14void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 15{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
297} 298}
298EXPORT_SYMBOL(autoremove_wake_function); 299EXPORT_SYMBOL(autoremove_wake_function);
299 300
301static inline bool is_kthread_should_stop(void)
302{
303 return (current->flags & PF_KTHREAD) && kthread_should_stop();
304}
305
306/*
307 * DEFINE_WAIT_FUNC(wait, woken_wake_function);
308 *
309 * add_wait_queue(&wq, &wait);
310 * for (;;) {
311 * if (condition)
312 * break;
313 *
314 * p->state = mode; condition = true;
315 * smp_mb(); // A smp_wmb(); // C
316 * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN;
317 * schedule() try_to_wake_up();
318 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
319 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
320 * smp_mb() // B smp_wmb(); // C
321 * wait->flags |= WQ_FLAG_WOKEN;
322 * }
323 * remove_wait_queue(&wq, &wait);
324 *
325 */
326long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
327{
328 set_current_state(mode); /* A */
329 /*
330 * The above implies an smp_mb(), which matches with the smp_wmb() from
331 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
332 * also observe all state before the wakeup.
333 */
334 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
335 timeout = schedule_timeout(timeout);
336 __set_current_state(TASK_RUNNING);
337
338 /*
339 * The below implies an smp_mb(), it too pairs with the smp_wmb() from
340 * woken_wake_function() such that we must either observe the wait
341 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
342 * an event.
343 */
344 set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
345
346 return timeout;
347}
348EXPORT_SYMBOL(wait_woken);
349
350int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
351{
352 /*
353 * Although this function is called under waitqueue lock, LOCK
354 * doesn't imply write barrier and the users expects write
355 * barrier semantics on wakeup functions. The following
356 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
357 * and is paired with set_mb() in wait_woken().
358 */
359 smp_wmb(); /* C */
360 wait->flags |= WQ_FLAG_WOKEN;
361
362 return default_wake_function(wait, mode, sync, key);
363}
364EXPORT_SYMBOL(woken_wake_function);
365
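A hedged usage sketch of the new API, following the pattern laid out in the comment block above wait_woken(); the waitqueue head, the flag pointer and the error handling are placeholders rather than a caller from this series:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/errno.h>

static int wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        long timeout = MAX_SCHEDULE_TIMEOUT;
        int ret = 0;

        add_wait_queue(wq, &wait);
        while (!*flag) {
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                /* Sleeps unless WQ_FLAG_WOKEN is already set; see wait_woken() above. */
                timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
        }
        remove_wait_queue(wq, &wait);

        return ret;
}

The matching waker sets the flag and calls wake_up() on the same waitqueue head; that lands in woken_wake_function(), which marks WQ_FLAG_WOKEN before waking the task and so closes the race the diagram above describes.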
300int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) 366int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
301{ 367{
302 struct wait_bit_key *key = arg; 368 struct wait_bit_key *key = arg;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
110 set_current_state(TASK_INTERRUPTIBLE); 110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable(); 111 preempt_disable();
112 if (kthread_should_stop()) { 112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING); 113 __set_current_state(TASK_RUNNING);
114 preempt_enable(); 114 preempt_enable();
115 if (ht->cleanup) 115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu)); 116 ht->cleanup(td->cpu, cpu_online(td->cpu));
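The hunk below converges on the following sleep/run idiom; this is a paraphrase of the pattern, not the resulting file, and must_handle_work()/do_work() are stand-ins for the kthread_should_stop()/park/should_run checks and the per-CPU work:

        set_current_state(TASK_INTERRUPTIBLE);  /* full barrier against concurrent wakeups */
        preempt_disable();
        if (must_handle_work()) {
                /* Not going to sleep, so the non-barrier variant is sufficient. */
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                do_work();
        } else {
                /* schedule() follows immediately, no need to check for resched here. */
                preempt_enable_no_resched();
                schedule();
        }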
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
136 /* Check for state change setup */ 136 /* Check for state change setup */
137 switch (td->status) { 137 switch (td->status) {
138 case HP_THREAD_NONE: 138 case HP_THREAD_NONE:
139 __set_current_state(TASK_RUNNING);
139 preempt_enable(); 140 preempt_enable();
140 if (ht->setup) 141 if (ht->setup)
141 ht->setup(td->cpu); 142 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE; 143 td->status = HP_THREAD_ACTIVE;
143 preempt_disable(); 144 continue;
144 break; 145
145 case HP_THREAD_PARKED: 146 case HP_THREAD_PARKED:
147 __set_current_state(TASK_RUNNING);
146 preempt_enable(); 148 preempt_enable();
147 if (ht->unpark) 149 if (ht->unpark)
148 ht->unpark(td->cpu); 150 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE; 151 td->status = HP_THREAD_ACTIVE;
150 preempt_disable(); 152 continue;
151 break;
152 } 153 }
153 154
154 if (!ht->thread_should_run(td->cpu)) { 155 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable(); 156 preempt_enable_no_resched();
156 schedule(); 157 schedule();
157 } else { 158 } else {
158 set_current_state(TASK_RUNNING); 159 __set_current_state(TASK_RUNNING);
159 preempt_enable(); 160 preempt_enable();
160 ht->thread_fn(td->cpu); 161 ht->thread_fn(td->cpu);
161 } 162 }