Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 241
 1 file changed, 185 insertions, 56 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
+		/* Possble rq->lock 'hole'.  */
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_preempt_count(p) & PREEMPT_ACTIVE));
+			!p->on_rq);
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1407,7 +1411,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+	if (p->nr_cpus_allowed > 1)
+		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	if (!is_idle_task(rq->curr))
-		return;
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
 
 	if (set_nr_if_polling(rq->idle)) {
 		trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
 		/* Else cpu is not in idle, do nothing here */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+out:
+	rcu_read_unlock();
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults_memory = NULL;
-	p->numa_faults_buffer_memory = NULL;
+	p->numa_faults = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 
-	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
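The __dl_* accounting helpers are only removed from this file; task_can_attach() added further down still calls __dl_overflow() and __dl_add(), so they evidently move to a header shared with the deadline class (not visible in this per-file diff). A standalone sketch of the admission test they implement, with an invented struct and example numbers (the kernel stores bandwidths as fixed-point values scaled by 1 << 20):

/*
 * Sketch only: the struct and numbers below are invented for illustration;
 * the test itself matches the removed __dl_overflow() helper.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw_example {
	int64_t bw;		/* per-CPU cap, -1 means "no limit"  */
	uint64_t total_bw;	/* bandwidth already admitted        */
};

static bool dl_overflow_example(const struct dl_bw_example *dl_b, int cpus,
				uint64_t old_bw, uint64_t new_bw)
{
	/* Would the new allocation exceed cap * cpus? */
	return dl_b->bw != -1 &&
	       (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* 50% cap per CPU on 4 CPUs, 150% of a CPU already admitted. */
	struct dl_bw_example dl_b = { .bw = 1 << 19, .total_bw = 3u << 19 };

	/* 150% + 40% = 190% <= 200% total cap: fits. */
	printf("adding a 40%% task overflows: %d\n",
	       dl_overflow_example(&dl_b, 4, 0, (1u << 20) * 2 / 5));

	/* 150% + 60% = 210% > 200% total cap: rejected. */
	printf("adding a 60%% task overflows: %d\n",
	       dl_overflow_example(&dl_b, 4, 0, (1u << 20) * 3 / 5));
	return 0;
}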
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 
 /**
  * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
  */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
+	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+	return rq;
 }
 
 #ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
-
-	finish_task_switch(rq, prev);
+	struct rq *rq;
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
+	/* finish_task_switch() drops rq->lock and enables preemtion */
+	preempt_disable();
+	rq = finish_task_switch(prev);
 	post_schedule(rq);
+	preempt_enable();
 
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
 /*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
  */
-static inline void
+static inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
-
 	barrier();
-	/*
-	 * this_rq must be evaluated again because prev may have moved
-	 * CPUs since it called schedule(), thus the 'rq' on its stack
-	 * frame will be invalid.
-	 */
-	finish_task_switch(this_rq(), prev);
+
+	return finish_task_switch(prev);
 }
 
 /*
@@ -2826,15 +2813,8 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
-		context_switch(rq, prev, next); /* unlocks the rq */
-		/*
-		 * The context switch have flipped the stack from under us
-		 * and restored the local variables which were saved when
-		 * this task called schedule() in the past. prev == current
-		 * is still correct, but it can be moved to another cpu/rq.
-		 */
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+		rq = context_switch(rq, prev, next); /* unlocks the rq */
+		cpu = cpu_of(rq);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
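The removed comment above and the new kernel-doc on finish_task_switch() give the reason for this calling-convention change: after switch_to() the stack belongs to a task that may have last run on another CPU, so any cached rq/cpu locals are stale and must be re-derived. A small userspace analogy of that pattern (all names invented for illustration):

/*
 * Userspace analogy (all names invented): the callee re-derives the per-CPU
 * context and returns it, instead of the caller recomputing it afterwards.
 */
#include <stdio.h>

struct rq_example { int cpu; };

static struct rq_example rqs[2] = { { 0 }, { 1 } };

/* Stand-in for this_rq(): pretend we were migrated while switched out. */
static struct rq_example *current_rq_example(const struct rq_example *before)
{
	return &rqs[!before->cpu];
}

/* Stand-in for context_switch(): returns the rq that is valid afterwards. */
static struct rq_example *switch_example(struct rq_example *rq)
{
	/* ... switch_to() would run here; 'rq' may be stale on return ... */
	return current_rq_example(rq);
}

int main(void)
{
	struct rq_example *rq = &rqs[0];

	rq = switch_example(rq);	/* never reuse the pre-switch pointer */
	printf("now on cpu %d\n", rq->cpu);
	return 0;
}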
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+			      const struct cpumask *trial)
+{
+	int ret = 1, trial_cpus;
+	struct dl_bw *cur_dl_b;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	cur_dl_b = dl_bw_of(cpumask_any(cur));
+	trial_cpus = cpumask_weight(trial);
+
+	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+	if (cur_dl_b->bw != -1 &&
+	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+		ret = 0;
+	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+	rcu_read_unlock_sched();
+
+	return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary. Thus, cpusets are not
+	 * applicable for such threads. This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+#ifdef CONFIG_SMP
+	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+					      cs_cpus_allowed)) {
+		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+		struct dl_bw *dl_b;
+		bool overflow;
+		int cpus;
+		unsigned long flags;
+
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(dest_cpu);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(dest_cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+		if (overflow)
+			ret = -EBUSY;
+		else {
+			/*
+			 * We reserve space for this task in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, p->dl.dl_bw);
+		}
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		rcu_read_unlock_sched();
+
+	}
+#endif
+out:
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /*
  * move_queued_task - move a queued task to new rq.
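A worked example of the shrink test in cpuset_cpumask_can_shrink() above, as standalone C; the 95% per-CPU cap and the amount of bandwidth already admitted are invented numbers, fixed point scaled by 1 << 20 as in the kernel's accounting:

/* Example only: cap and admitted bandwidth values below are made up. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_UNIT (1u << 20)

static bool can_shrink_example(int64_t per_cpu_cap, uint64_t total_admitted,
			       int trial_cpus)
{
	/* Same condition as the hunk: refuse if cap * cpus < already admitted. */
	if (per_cpu_cap != -1 &&
	    (uint64_t)per_cpu_cap * trial_cpus < total_admitted)
		return false;
	return true;
}

int main(void)
{
	int64_t cap = BW_UNIT * 95 / 100;	/* 95% of each CPU        */
	uint64_t admitted = BW_UNIT * 3 / 2;	/* 1.5 CPUs' worth in use */

	printf("shrink to 2 CPUs: %s\n",
	       can_shrink_example(cap, admitted, 2) ? "ok" : "refused");
	printf("shrink to 1 CPU:  %s\n",
	       can_shrink_example(cap, admitted, 1) ? "ok" : "refused");
	return 0;
}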
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
 static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 #endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
 	printk(KERN_WARNING "\n");
 }
 
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
 {
 	int i;
 
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
 	return false;
 }
 
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ *   is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ *   there is an intermediary node C, which is < N hops away from both
+ *   nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+	int a, b, c, n;
+
+	n = sched_max_numa_distance;
+
+	if (n <= 1)
+		sched_numa_topology_type = NUMA_DIRECT;
+
+	for_each_online_node(a) {
+		for_each_online_node(b) {
+			/* Find two nodes furthest removed from each other. */
+			if (node_distance(a, b) < n)
+				continue;
+
+			/* Is there an intermediary node between a and b? */
+			for_each_online_node(c) {
+				if (node_distance(a, c) < n &&
+				    node_distance(b, c) < n) {
+					sched_numa_topology_type =
+							NUMA_GLUELESS_MESH;
+					return;
+				}
+			}
+
+			sched_numa_topology_type = NUMA_BACKPLANE;
+			return;
+		}
+	}
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
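A standalone illustration of the classification logic above, run over two invented node_distance() tables (SLIT-style values, local distance 10). It mirrors the loop structure of init_numa_topology_type(); the direct-connection test here simply checks that only local distances exist, and none of this is kernel code:

/* Example only: the distance tables below are invented. */
#include <stdio.h>

#define NR_NODES 4

/* Glueless mesh: the farthest pairs (distance 30) always have a node that
 * is closer to both endpoints. */
static const int mesh[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

/* Backplane: all remote nodes are equally far apart, so no on-node
 * intermediary exists between the farthest pairs. */
static const int backplane[NR_NODES][NR_NODES] = {
	{ 10, 40, 40, 40 },
	{ 40, 10, 40, 40 },
	{ 40, 40, 10, 40 },
	{ 40, 40, 40, 10 },
};

static const char *classify(const int d[NR_NODES][NR_NODES])
{
	int a, b, c, max = 0;

	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++)
			if (d[a][b] > max)
				max = d[a][b];

	if (max == 10)
		return "NUMA_DIRECT";		/* only local distances */

	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			if (d[a][b] < max)
				continue;	/* not a farthest pair */
			for (c = 0; c < NR_NODES; c++)
				if (d[a][c] < max && d[b][c] < max)
					return "NUMA_GLUELESS_MESH";
			return "NUMA_BACKPLANE";
		}
	}
	return "NUMA_DIRECT";
}

int main(void)
{
	printf("mesh table:      %s\n", classify(mesh));
	printf("backplane table: %s\n", classify(backplane));
	return 0;
}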
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = level;
+	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+	init_numa_topology_type();
 }
 
 static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state,
+	 * since we will exit with TASK_RUNNING make sure we enter with it,
+	 * otherwise we will destroy state.
+	 */
+	if (WARN_ONCE(current->state != TASK_RUNNING,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change))
+		__set_current_state(TASK_RUNNING);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
 	static unsigned long prev_jiffy; /* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
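A kernel-style sketch of the bug class the new WARN_ONCE() in __might_sleep() is aimed at (not part of this patch; the lock, flag, and function names are invented, and the check fires when CONFIG_DEBUG_ATOMIC_SLEEP is enabled):

#include <linux/mutex.h>
#include <linux/sched.h>

static DEFINE_MUTEX(example_lock);	/* invented for illustration */
static bool example_condition;		/* invented for illustration */

static void example_wait(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (!example_condition) {
		/*
		 * mutex_lock() may sleep, so its might_sleep() check hits the
		 * new WARN_ONCE() and resets us to TASK_RUNNING; the
		 * schedule() below would then no longer block waiting for a
		 * wakeup, which is the silent bug the warning surfaces.
		 */
		mutex_lock(&example_lock);
		mutex_unlock(&example_lock);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}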
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 #endif
 	dump_stack();
 }
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ