4 files changed, 73 insertions, 29 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 039baa4cd90c..bceb89557973 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1890,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void)
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        scan_for_empty_cpusets(&top_cpuset);
+        /*
+         * Scheduler destroys domains on hotplug events.
+         * Rebuild them based on the current settings.
+         */
+        rebuild_sched_domains();
        cgroup_unlock();
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..4a3cb0614158 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1127,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
        return HRTIMER_NORESTART;
 }
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -1182,6 +1183,7 @@ static void init_hrtick(void)
 {
        hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 static void init_rq_hrtick(struct rq *rq)
 {
@@ -7236,6 +7238,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 /*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+        ndoms_cur = 0;
+        if (doms_cur != &fallback_doms)
+                kfree(doms_cur);
+        doms_cur = &fallback_doms;
+}
+/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
@@ -7382,6 +7396,7 @@ int arch_reinit_sched_domains(void)
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        detach_destroy_domains(&cpu_online_map);
+        free_sched_domains();
        err = arch_init_sched_domains(&cpu_online_map);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
@@ -7467,6 +7482,7 @@ static int update_sched_domains(struct notifier_block *nfb,
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                detach_destroy_domains(&cpu_online_map);
+                free_sched_domains();
                return NOTIFY_OK;
        case CPU_UP_CANCELED:
@@ -7485,8 +7501,16 @@ static int update_sched_domains(struct notifier_block *nfb,
                return NOTIFY_DONE;
        }
+#ifndef CONFIG_CPUSETS
+        /*
+         * Create default domain partitioning if cpusets are disabled.
+         * Otherwise we let cpusets rebuild the domains based on the
+         * current setup.
+         */
        /* The hotplug lock is already held by cpu_up/cpu_down */
        arch_init_sched_domains(&cpu_online_map);
+#endif
        return NOTIFY_OK;
 }
@@ -7626,7 +7650,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        else
                rt_se->rt_rq = parent->my_q;
-        rt_se->rt_rq = &rq->rt;
        rt_se->my_q = rt_rq;
        rt_se->parent = parent;
        INIT_LIST_HEAD(&rt_se->run_list);
@@ -8348,7 +8371,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-        struct task_group *tgi, *parent = tg->parent;
+        struct task_group *tgi, *parent = tg ? tg->parent : NULL;
        unsigned long total = 0;
        if (!parent) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..1dad5bbb59b6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -449,13 +449,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 #endif
 }
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
        struct rt_rq *group_rq = group_rt_rq(rt_se);
-        if (group_rq && rt_rq_throttled(group_rq))
+        /*
+         * Don't enqueue the group if it's throttled, or when empty.
+         * The latter is a consequence of the former when a child group
+         * gets throttled and the current group doesn't have any other
+         * active members.
+         */
+        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
        list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -464,7 +470,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
        inc_rt_tasks(rt_se, rt_rq);
 }
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +486,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
-static void dequeue_rt_stack(struct task_struct *p)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 {
-        struct sched_rt_entity *rt_se, *back = NULL;
+        struct sched_rt_entity *back = NULL;
-        rt_se = &p->rt;
        for_each_sched_rt_entity(rt_se) {
                rt_se->back = back;
                back = rt_se;
@@ -492,7 +497,26 @@ static void dequeue_rt_stack(struct task_struct *p)
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
-                        dequeue_rt_entity(rt_se);
+                        __dequeue_rt_entity(rt_se);
+        }
+}
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+        dequeue_rt_stack(rt_se);
+        for_each_sched_rt_entity(rt_se)
+                __enqueue_rt_entity(rt_se);
+}
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+        dequeue_rt_stack(rt_se);
+        for_each_sched_rt_entity(rt_se) {
+                struct rt_rq *rt_rq = group_rt_rq(rt_se);
+                if (rt_rq && rt_rq->rt_nr_running)
+                        __enqueue_rt_entity(rt_se);
        }
 }
@@ -506,32 +530,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
        if (wakeup)
                rt_se->timeout = 0;
-        dequeue_rt_stack(p);
+        enqueue_rt_entity(rt_se);
-        /*
-         * enqueue everybody, bottom - up.
-         */
-        for_each_sched_rt_entity(rt_se)
-                enqueue_rt_entity(rt_se);
 }
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 {
        struct sched_rt_entity *rt_se = &p->rt;
-        struct rt_rq *rt_rq;
        update_curr_rt(rq);
+        dequeue_rt_entity(rt_se);
-        dequeue_rt_stack(p);
-        /*
-         * re-enqueue all non-empty rt_rq entities.
-         */
-        for_each_sched_rt_entity(rt_se) {
-                rt_rq = group_rt_rq(rt_se);
-                if (rt_rq && rt_rq->rt_nr_running)
-                        enqueue_rt_entity(rt_se);
-        }
 }
 /*
@@ -542,8 +549,10 @@ static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
        struct rt_prio_array *array = &rt_rq->active;
+        struct list_head *queue = array->queue + rt_se_prio(rt_se);
-        list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+        if (on_rt_rq(rt_se))
+                list_move_tail(&rt_se->run_list, queue);
 }
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index a38878e0e49d..80179ef7450e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -198,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t)
 /*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
 */
 static inline void sched_info_depart(struct task_struct *t)
 {
@@ -206,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t)
        t->sched_info.cpu_time += delta;
        rq_sched_info_depart(task_rq(t), delta);
+        if (t->state == TASK_RUNNING)
+                sched_info_queued(t);
 }
 /*

diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 039baa4cd90c..bceb89557973 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -1890,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void)
1890	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];	1890	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1891	scan_for_empty_cpusets(&top_cpuset);	1891	scan_for_empty_cpusets(&top_cpuset);
1892		1892
		1893	/*
		1894	* Scheduler destroys domains on hotplug events.
		1895	* Rebuild them based on the current settings.
		1896	*/
		1897	rebuild_sched_domains();
		1898
1893	cgroup_unlock();	1899	cgroup_unlock();
1894	}	1900	}
1895		1901


diff --git a/kernel/sched.c b/kernel/sched.c index eaf6751e7612..4a3cb0614158 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -1127,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1127	return HRTIMER_NORESTART;	1127	return HRTIMER_NORESTART;
1128	}	1128	}
1129		1129
		1130	#ifdef CONFIG_SMP
1130	static void hotplug_hrtick_disable(int cpu)	1131	static void hotplug_hrtick_disable(int cpu)
1131	{	1132	{
1132	struct rq *rq = cpu_rq(cpu);	1133	struct rq *rq = cpu_rq(cpu);
@@ -1182,6 +1183,7 @@ static void init_hrtick(void)
1182	{	1183	{
1183	hotcpu_notifier(hotplug_hrtick, 0);	1184	hotcpu_notifier(hotplug_hrtick, 0);
1184	}	1185	}
		1186	#endif /* CONFIG_SMP */
1185		1187
1186	static void init_rq_hrtick(struct rq *rq)	1188	static void init_rq_hrtick(struct rq *rq)
1187	{	1189	{
@@ -7236,6 +7238,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7236	}	7238	}
7237		7239
7238	/*	7240	/*
		7241	* Free current domain masks.
		7242	* Called after all cpus are attached to NULL domain.
		7243	*/
		7244	static void free_sched_domains(void)
		7245	{
		7246	ndoms_cur = 0;
		7247	if (doms_cur != &fallback_doms)
		7248	kfree(doms_cur);
		7249	doms_cur = &fallback_doms;
		7250	}
		7251
		7252	/*
7239	* Set up scheduler domains and groups. Callers must hold the hotplug lock.	7253	* Set up scheduler domains and groups. Callers must hold the hotplug lock.
7240	* For now this just excludes isolated cpus, but could be used to	7254	* For now this just excludes isolated cpus, but could be used to
7241	* exclude other special cases in the future.	7255	* exclude other special cases in the future.
@@ -7382,6 +7396,7 @@ int arch_reinit_sched_domains(void)
7382	get_online_cpus();	7396	get_online_cpus();
7383	mutex_lock(&sched_domains_mutex);	7397	mutex_lock(&sched_domains_mutex);
7384	detach_destroy_domains(&cpu_online_map);	7398	detach_destroy_domains(&cpu_online_map);
		7399	free_sched_domains();
7385	err = arch_init_sched_domains(&cpu_online_map);	7400	err = arch_init_sched_domains(&cpu_online_map);
7386	mutex_unlock(&sched_domains_mutex);	7401	mutex_unlock(&sched_domains_mutex);
7387	put_online_cpus();	7402	put_online_cpus();
@@ -7467,6 +7482,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7467	case CPU_DOWN_PREPARE:	7482	case CPU_DOWN_PREPARE:
7468	case CPU_DOWN_PREPARE_FROZEN:	7483	case CPU_DOWN_PREPARE_FROZEN:
7469	detach_destroy_domains(&cpu_online_map);	7484	detach_destroy_domains(&cpu_online_map);
		7485	free_sched_domains();
7470	return NOTIFY_OK;	7486	return NOTIFY_OK;
7471		7487
7472	case CPU_UP_CANCELED:	7488	case CPU_UP_CANCELED:
@@ -7485,8 +7501,16 @@ static int update_sched_domains(struct notifier_block *nfb,
7485	return NOTIFY_DONE;	7501	return NOTIFY_DONE;
7486	}	7502	}
7487		7503
		7504	#ifndef CONFIG_CPUSETS
		7505	/*
		7506	* Create default domain partitioning if cpusets are disabled.
		7507	* Otherwise we let cpusets rebuild the domains based on the
		7508	* current setup.
		7509	*/
		7510
7488	/* The hotplug lock is already held by cpu_up/cpu_down */	7511	/* The hotplug lock is already held by cpu_up/cpu_down */
7489	arch_init_sched_domains(&cpu_online_map);	7512	arch_init_sched_domains(&cpu_online_map);
		7513	#endif
7490		7514
7491	return NOTIFY_OK;	7515	return NOTIFY_OK;
7492	}	7516	}
@@ -7626,7 +7650,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7626	else	7650	else
7627	rt_se->rt_rq = parent->my_q;	7651	rt_se->rt_rq = parent->my_q;
7628		7652
7629	rt_se->rt_rq = &rq->rt;
7630	rt_se->my_q = rt_rq;	7653	rt_se->my_q = rt_rq;
7631	rt_se->parent = parent;	7654	rt_se->parent = parent;
7632	INIT_LIST_HEAD(&rt_se->run_list);	7655	INIT_LIST_HEAD(&rt_se->run_list);
@@ -8348,7 +8371,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8348	#ifdef CONFIG_CGROUP_SCHED	8371	#ifdef CONFIG_CGROUP_SCHED
8349	static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)	8372	static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8350	{	8373	{
8351	struct task_group *tgi, *parent = tg->parent;	8374	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
8352	unsigned long total = 0;	8375	unsigned long total = 0;
8353		8376
8354	if (!parent) {	8377	if (!parent) {


diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d573205d..1dad5bbb59b6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c
@@ -449,13 +449,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
449	#endif	449	#endif
450	}	450	}
451		451
452	static void enqueue_rt_entity(struct sched_rt_entity *rt_se)	452	static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
453	{	453	{
454	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);	454	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
455	struct rt_prio_array *array = &rt_rq->active;	455	struct rt_prio_array *array = &rt_rq->active;
456	struct rt_rq *group_rq = group_rt_rq(rt_se);	456	struct rt_rq *group_rq = group_rt_rq(rt_se);
457		457
458	if (group_rq && rt_rq_throttled(group_rq))	458	/*
		459	* Don't enqueue the group if it's throttled, or when empty.
		460	* The latter is a consequence of the former when a child group
		461	* gets throttled and the current group doesn't have any other
		462	* active members.
		463	*/
		464	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
459	return;	465	return;
460		466
461	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));	467	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -464,7 +470,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
464	inc_rt_tasks(rt_se, rt_rq);	470	inc_rt_tasks(rt_se, rt_rq);
465	}	471	}
466		472
467	static void dequeue_rt_entity(struct sched_rt_entity *rt_se)	473	static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
468	{	474	{
469	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);	475	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
470	struct rt_prio_array *array = &rt_rq->active;	476	struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +486,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
480	* Because the prio of an upper entry depends on the lower	486	* Because the prio of an upper entry depends on the lower
481	* entries, we must remove entries top - down.	487	* entries, we must remove entries top - down.
482	*/	488	*/
483	static void dequeue_rt_stack(struct task_struct *p)	489	static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
484	{	490	{
485	struct sched_rt_entity *rt_se, *back = NULL;	491	struct sched_rt_entity *back = NULL;
486		492
487	rt_se = &p->rt;
488	for_each_sched_rt_entity(rt_se) {	493	for_each_sched_rt_entity(rt_se) {
489	rt_se->back = back;	494	rt_se->back = back;
490	back = rt_se;	495	back = rt_se;
@@ -492,7 +497,26 @@ static void dequeue_rt_stack(struct task_struct *p)
492		497
493	for (rt_se = back; rt_se; rt_se = rt_se->back) {	498	for (rt_se = back; rt_se; rt_se = rt_se->back) {
494	if (on_rt_rq(rt_se))	499	if (on_rt_rq(rt_se))
495	dequeue_rt_entity(rt_se);	500	__dequeue_rt_entity(rt_se);
		501	}
		502	}
		503
		504	static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
		505	{
		506	dequeue_rt_stack(rt_se);
		507	for_each_sched_rt_entity(rt_se)
		508	__enqueue_rt_entity(rt_se);
		509	}
		510
		511	static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
		512	{
		513	dequeue_rt_stack(rt_se);
		514
		515	for_each_sched_rt_entity(rt_se) {
		516	struct rt_rq *rt_rq = group_rt_rq(rt_se);
		517
		518	if (rt_rq && rt_rq->rt_nr_running)
		519	__enqueue_rt_entity(rt_se);
496	}	520	}
497	}	521	}
498		522
@@ -506,32 +530,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
506	if (wakeup)	530	if (wakeup)
507	rt_se->timeout = 0;	531	rt_se->timeout = 0;
508		532
509	dequeue_rt_stack(p);	533	enqueue_rt_entity(rt_se);
510
511	/*
512	* enqueue everybody, bottom - up.
513	*/
514	for_each_sched_rt_entity(rt_se)
515	enqueue_rt_entity(rt_se);
516	}	534	}
517		535
518	static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)	536	static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
519	{	537	{
520	struct sched_rt_entity *rt_se = &p->rt;	538	struct sched_rt_entity *rt_se = &p->rt;
521	struct rt_rq *rt_rq;
522		539
523	update_curr_rt(rq);	540	update_curr_rt(rq);
524		541	dequeue_rt_entity(rt_se);
525	dequeue_rt_stack(p);
526
527	/*
528	* re-enqueue all non-empty rt_rq entities.
529	*/
530	for_each_sched_rt_entity(rt_se) {
531	rt_rq = group_rt_rq(rt_se);
532	if (rt_rq && rt_rq->rt_nr_running)
533	enqueue_rt_entity(rt_se);
534	}
535	}	542	}
536		543
537	/*	544	/*
@@ -542,8 +549,10 @@ static
542	void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)	549	void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
543	{	550	{
544	struct rt_prio_array *array = &rt_rq->active;	551	struct rt_prio_array *array = &rt_rq->active;
		552	struct list_head *queue = array->queue + rt_se_prio(rt_se);
545		553
546	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));	554	if (on_rt_rq(rt_se))
		555	list_move_tail(&rt_se->run_list, queue);
547	}	556	}
548		557
549	static void requeue_task_rt(struct rq *rq, struct task_struct *p)	558	static void requeue_task_rt(struct rq *rq, struct task_struct *p)


diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index a38878e0e49d..80179ef7450e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h
@@ -198,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t)
198	/*	198	/*
199	* Called when a process ceases being the active-running process, either	199	* Called when a process ceases being the active-running process, either
200	* voluntarily or involuntarily. Now we can calculate how long we ran.	200	* voluntarily or involuntarily. Now we can calculate how long we ran.
		201	* Also, if the process is still in the TASK_RUNNING state, call
		202	* sched_info_queued() to mark that it has now again started waiting on
		203	* the runqueue.
201	*/	204	*/
202	static inline void sched_info_depart(struct task_struct *t)	205	static inline void sched_info_depart(struct task_struct *t)
203	{	206	{
@@ -206,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t)
206		209
207	t->sched_info.cpu_time += delta;	210	t->sched_info.cpu_time += delta;
208	rq_sched_info_depart(task_rq(t), delta);	211	rq_sched_info_depart(task_rq(t), delta);
		212
		213	if (t->state == TASK_RUNNING)
		214	sched_info_queued(t);
209	}	215	}
210		216
211	/*	217	/*