path: root/kernel/sched/core.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 00:21:34 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 00:21:34 -0500
commit    86c6a2fddf0b89b494c7616f2c06cf915c4bff01 (patch)
tree      0e6930c93e5d49ead71b17fcadf0cc9ba28c3d2d /kernel/sched/core.c
parent    bee2782f30f66898be3f74ad02e4d1f87a969694 (diff)
parent    fd7de1e8d5b2b2b35e71332fafb899f584597150 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

  - 'Nested Sleep Debugging', activated when CONFIG_DEBUG_ATOMIC_SLEEP=y.

    This instruments might_sleep() checks to catch places that nest
    blocking primitives - such as mutex usage in a wait loop.  Such bugs
    can result in hard-to-debug races/hangs.

    Another category of invalid nesting that this facility will detect
    is the calling of blocking functions from within schedule() ->
    sched_submit_work() -> blk_schedule_flush_plug().

    There's some potential for false positives (if secondary blocking
    primitives themselves are not ready yet for this facility), but the
    kernel will warn once per bootup about such bugs, so the warning
    isn't much of a nuisance.

    This feature comes with a number of fixes for problems uncovered
    with it, so no messages are expected normally.

  - Another round of sched/numa optimizations and refinements, for
    CONFIG_NUMA_BALANCING=y.

  - Another round of sched/dl fixes and refinements.

  Plus various smaller fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched: Add missing rcu protection to wake_up_all_idle_cpus
  sched/deadline: Introduce start_hrtick_dl() for !CONFIG_SCHED_HRTICK
  sched/numa: Init numa balancing fields of init_task
  sched/deadline: Remove unnecessary definitions in cpudeadline.h
  sched/cpupri: Remove unnecessary definitions in cpupri.h
  sched/deadline: Fix rq->dl.pushable_tasks bug in push_dl_task()
  sched/fair: Fix stale overloaded status in the busiest group finding logic
  sched: Move p->nr_cpus_allowed check to select_task_rq()
  sched/completion: Document when to use wait_for_completion_io_*()
  sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC
  sched/fair: Kill task_struct::numa_entry and numa_group::task_list
  sched: Refactor task_struct to use numa_faults instead of numa_* pointers
  sched/deadline: Don't check CONFIG_SMP in switched_from_dl()
  sched/deadline: Reschedule from switched_from_dl() after a successful pull
  sched/deadline: Push task away if the deadline is equal to curr during wakeup
  sched/deadline: Add deadline rq status print
  sched/deadline: Fix artificial overrun introduced by yield_task_dl()
  sched/rt: Clean up check_preempt_equal_prio()
  sched/core: Use dl_bw_of() under rcu_read_lock_sched()
  sched: Check if we got a shallowest_idle_cpu before searching for least_loaded_cpu
  ...
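To make the 'Nested Sleep Debugging' pattern above concrete, here is a minimal kernel-style sketch of the kind of wait loop the new instrumentation flags. This is illustrative only and not part of this merge; the demo_* identifiers are hypothetical.

#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static DEFINE_MUTEX(demo_lock);
static bool demo_done;

static void demo_wait_for_done(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* Sets current->state to TASK_UNINTERRUPTIBLE. */
                prepare_to_wait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);

                /*
                 * mutex_lock() may block and calls might_sleep(); with
                 * CONFIG_DEBUG_ATOMIC_SLEEP=y the new check fires here,
                 * because current->state is no longer TASK_RUNNING.
                 */
                mutex_lock(&demo_lock);
                if (demo_done) {
                        mutex_unlock(&demo_lock);
                        break;
                }
                mutex_unlock(&demo_lock);

                schedule();
        }
        finish_wait(&demo_wq, &wait);
}

With CONFIG_DEBUG_ATOMIC_SLEEP=y, the mutex_lock() inside the loop trips the new WARN_ONCE() in __might_sleep() shown in the diff below ("do not call blocking ops when !TASK_RUNNING"), because the task state was already set by prepare_to_wait().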
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  241
1 file changed, 185 insertions(+), 56 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * Can drop rq->lock because sched_class::switched_from() methods drop it.
+ */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
+		/* Possible rq->lock 'hole'. */
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_preempt_count(p) & PREEMPT_ACTIVE));
+			!p->on_rq);
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1407,7 +1411,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+	if (p->nr_cpus_allowed > 1)
+		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	if (!is_idle_task(rq->curr))
-		return;
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
 
 	if (set_nr_if_polling(rq->idle)) {
 		trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
 		/* Else cpu is not in idle, do nothing here */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+out:
+	rcu_read_unlock();
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults_memory = NULL;
-	p->numa_faults_buffer_memory = NULL;
+	p->numa_faults = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 
-	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 
 /**
  * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
 */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
+	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+	return rq;
 }
 
 #ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
-
-	finish_task_switch(rq, prev);
+	struct rq *rq;
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
+	/* finish_task_switch() drops rq->lock and enables preemption */
+	preempt_disable();
+	rq = finish_task_switch(prev);
 	post_schedule(rq);
+	preempt_enable();
 
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
 */
-static inline void
+static inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
-
 	barrier();
-	/*
-	 * this_rq must be evaluated again because prev may have moved
-	 * CPUs since it called schedule(), thus the 'rq' on its stack
-	 * frame will be invalid.
-	 */
-	finish_task_switch(this_rq(), prev);
+
+	return finish_task_switch(prev);
 }
 
 /*
@@ -2826,15 +2813,8 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
-		context_switch(rq, prev, next); /* unlocks the rq */
-		/*
-		 * The context switch have flipped the stack from under us
-		 * and restored the local variables which were saved when
-		 * this task called schedule() in the past. prev == current
-		 * is still correct, but it can be moved to another cpu/rq.
-		 */
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+		rq = context_switch(rq, prev, next); /* unlocks the rq */
+		cpu = cpu_of(rq);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+			      const struct cpumask *trial)
+{
+	int ret = 1, trial_cpus;
+	struct dl_bw *cur_dl_b;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	cur_dl_b = dl_bw_of(cpumask_any(cur));
+	trial_cpus = cpumask_weight(trial);
+
+	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+	if (cur_dl_b->bw != -1 &&
+	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+		ret = 0;
+	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+	rcu_read_unlock_sched();
+
+	return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+#ifdef CONFIG_SMP
+	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+					      cs_cpus_allowed)) {
+		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+		struct dl_bw *dl_b;
+		bool overflow;
+		int cpus;
+		unsigned long flags;
+
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(dest_cpu);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(dest_cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+		if (overflow)
+			ret = -EBUSY;
+		else {
+			/*
+			 * We reserve space for this task in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, p->dl.dl_bw);
+		}
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		rcu_read_unlock_sched();
+
+	}
+#endif
+out:
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /*
 * move_queued_task - move a queued task to new rq.
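For reference, the deadline-bandwidth admission test that task_can_attach() performs through __dl_overflow() reduces to the arithmetic in the helpers removed from this file above. A standalone restatement follows (plain C sketch with illustrative numbers, not kernel code; dl_overflow_sketch is a hypothetical name):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the __dl_overflow() test: bw is the per-CPU bandwidth limit
 * (-1 means "no limit"), total_bw is the bandwidth already admitted on
 * the root_domain, and the caller wants to replace old_bw with new_bw.
 */
static bool dl_overflow_sketch(int64_t bw, uint64_t total_bw, int cpus,
                               uint64_t old_bw, uint64_t new_bw)
{
        return bw != -1 && (uint64_t)bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
        /* Limit 950000 per CPU on 2 CPUs, 1500000 already admitted:
         * admitting another 500000 overflows (2000000 > 1900000). */
        printf("overflow: %d\n",
               dl_overflow_sketch(950000, 1500000, 2, 0, 500000));
        return 0;
}

task_can_attach() calls the check with old_bw == 0 and new_bw == p->dl.dl_bw, i.e. it asks whether the destination root_domain can absorb the task's entire reserved bandwidth; on success the bandwidth is accounted immediately with __dl_add().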
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
 static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 #endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
 	printk(KERN_WARNING "\n");
 }
 
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
 {
 	int i;
 
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
 	return false;
 }
 
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ *   is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ *   there is an intermediary node C, which is < N hops away from both
+ *   nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+	int a, b, c, n;
+
+	n = sched_max_numa_distance;
+
+	if (n <= 1)
+		sched_numa_topology_type = NUMA_DIRECT;
+
+	for_each_online_node(a) {
+		for_each_online_node(b) {
+			/* Find two nodes furthest removed from each other. */
+			if (node_distance(a, b) < n)
+				continue;
+
+			/* Is there an intermediary node between a and b? */
+			for_each_online_node(c) {
+				if (node_distance(a, c) < n &&
+				    node_distance(b, c) < n) {
+					sched_numa_topology_type =
+							NUMA_GLUELESS_MESH;
+					return;
+				}
+			}
+
+			sched_numa_topology_type = NUMA_BACKPLANE;
+			return;
+		}
+	}
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = level;
+	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+	init_numa_topology_type();
 }
 
 static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state,
+	 * since we will exit with TASK_RUNNING, make sure we enter with it,
+	 * otherwise we will destroy state.
+	 */
+	if (WARN_ONCE(current->state != TASK_RUNNING,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change))
+		__set_current_state(TASK_RUNNING);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 #endif
 	dump_stack();
 }
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ