| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-20 13:35:05 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-20 13:35:05 -0400 |
| commit | 53795ced6e270fbb5cef7b527a71ffbb69657c78 (patch) | |
| tree | 0e0532682837493bb84a38df10bc115521226c81 | |
| parent | f78602ab7cbc902559406d2e8e21517056708295 (diff) | |
| parent | 8f6189684eb4e85e6c593cd710693f09c944450a (diff) | |
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar.
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Fix migration thread runtime bogosity
sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled
sched,cgroup: Fix up task_groups list
sched: fix divide by zero at {thread_group,task}_times
sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/sched/core.c | 35 |
| -rw-r--r-- | kernel/sched/fair.c | 11 |
| -rw-r--r-- | kernel/sched/rt.c | 13 |
| -rw-r--r-- | kernel/sched/sched.h | 8 |
| -rw-r--r-- | kernel/sched/stop_task.c | 22 |

5 files changed, 70 insertions, 19 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 82ad284f823b..fbf1fd098dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+	u64 temp = (__force u64) rtime;
+
+	temp *= (__force u64) utime;
+
+	if (sizeof(cputime_t) == 4)
+		temp = div_u64(temp, (__force u32) total);
+	else
+		temp = div64_u64(temp, (__force u64) total);
+
+	return (__force cputime_t) temp;
+}
+
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 */
 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(utime, rtime, total);
+	else
 		utime = rtime;
 
 	/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	total = cputime.utime + cputime.stime;
 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) cputime.utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(cputime.utime, rtime, total);
+	else
 		utime = rtime;
 
 	sig->prev_utime = max(sig->prev_utime, utime);
@@ -7246,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
 
 #ifdef CONFIG_CGROUP_SCHED
 struct task_group root_task_group;
+LIST_HEAD(task_groups);
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
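The core.c hunks replace the two open-coded do_div() scalings with the new scale_utime() helper, so the divisor is no longer truncated to 32 bits when cputime_t is 64 bits wide (a truncated divisor whose low 32 bits happen to be zero is what triggered the divide-by-zero named in the commit list). As a rough illustration of the arithmetic only, here is a stand-alone user-space sketch; the function name, types, and values below are stand-ins, not kernel code:

```c
#include <inttypes.h>
#include <stdio.h>

/*
 * Illustrative sketch of what scale_utime() computes: distribute the
 * measured runtime "rtime" between user and system time in the same
 * ratio as the sampled utime/stime tick counts.
 */
static uint64_t scale_utime_sketch(uint64_t utime, uint64_t rtime, uint64_t total)
{
	/* caller guarantees total != 0; the kernel falls back to utime = rtime */
	return rtime * utime / total;
}

int main(void)
{
	uint64_t utime = 30, stime = 10, rtime = 50;	/* arbitrary tick counts */
	uint64_t total = utime + stime;

	/* 30/40 of the 50 ticks of real runtime -> 37 ticks of user time */
	printf("scaled utime = %" PRIu64 "\n",
	       total ? scale_utime_sketch(utime, rtime, total) : rtime);
	return 0;
}
```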
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70b..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long now = jiffies;
+
+	if (rq->h_load_throttle == now)
+		return;
+
+	rq->h_load_throttle = now;
+
 	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 	rcu_read_unlock();
@@ -4293,11 +4301,10 @@ redo:
 	env.src_rq    = busiest;
 	env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
+	update_h_load(env.src_cpu);
 more_balance:
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
-	if (!env.loop)
-		update_h_load(env.src_cpu);
 
 	/*
 	 * cur_ld_moved - load moved in current iteration
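The fair.c change moves the hierarchical-load refresh out of the locked retry loop (it is now called once before double_rq_lock()) and rate-limits it to at most once per jiffy per runqueue via the new h_load_throttle stamp, which is where the rq->lock hold-time reduction for large cgroup hierarchies comes from. The pattern is a plain timestamp check; a minimal stand-alone sketch with illustrative names (not the kernel's structures or jiffies variable):

```c
#include <stdint.h>

/* Illustrative stand-ins for struct rq and the jiffies counter. */
struct rq_sketch {
	uint64_t h_load_throttle;	/* tick at which h_load was last refreshed */
};

static uint64_t jiffies_sketch;		/* advanced by a timer tick elsewhere */

static void expensive_tree_walk(void)
{
	/* stands in for the walk_tg_tree(tg_load_down, ...) pass */
}

static void update_h_load_sketch(struct rq_sketch *rq)
{
	uint64_t now = jiffies_sketch;

	if (rq->h_load_throttle == now)
		return;			/* already refreshed during this tick */

	rq->h_load_throttle = now;
	expensive_tree_walk();		/* runs at most once per tick per runqueue */
}
```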
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..944cb68420e9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 	const struct cpumask *span;
 
 	span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * FIXME: isolated CPUs should really leave the root task group,
+	 * whether they are isolcpus or were isolated via cpusets, lest
+	 * the timer run on a CPU which does not service all runqueues,
+	 * potentially leaving other CPUs indefinitely throttled.  If
+	 * isolation is really required, the user will turn the throttle
+	 * off to kill the perturbations it causes anyway.  Meanwhile,
+	 * this maintains functionality for boot and/or troubleshooting.
+	 */
+	if (rt_b == &root_task_group.rt_bandwidth)
+		span = cpu_online_mask;
+#endif
 	for_each_cpu(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..f6714d009e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
 struct cfs_rq;
 struct rt_rq;
 
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
 
 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+	unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct list_head leaf_rt_rq_list;
 #endif
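Together with the LIST_HEAD(task_groups) definition added to core.c above, the first sched.h hunk turns a per-translation-unit list into a single shared one: a `static LIST_HEAD()` in a header gives every .c file that includes it its own private, disconnected copy, so groups registered through one copy are invisible when another file walks the list. A user-space sketch of the same header/definition split, with illustrative names only:

```c
/*
 * Bug pattern being fixed (in a shared header):
 *	static struct group *group_list;   // every includer gets its OWN list!
 * Fix: declare the list once in the header, define it in exactly one .c file.
 */

/* --- groups.h: declaration only, shared by all includers ----------- */
struct group {
	struct group *next;
	const char *name;
};

extern struct group *group_list;

void register_group(struct group *g);

/* --- groups.c: the one shared definition ---------------------------- */
struct group *group_list;

void register_group(struct group *g)
{
	g->next = group_list;
	group_list = g;		/* now visible to every file that walks the list */
}
```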
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq)
+	if (stop && stop->on_rq) {
+		stop->se.exec_start = rq->clock_task;
 		return stop;
+	}
 
 	return NULL;
 }
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
 
 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
+	struct task_struct *curr = rq->curr;
+	u64 delta_exec;
+
+	delta_exec = rq->clock_task - curr->se.exec_start;
+	if (unlikely((s64)delta_exec < 0))
+		delta_exec = 0;
+
+	schedstat_set(curr->se.statistics.exec_max,
+			max(curr->se.statistics.exec_max, delta_exec));
+
+	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+
+	curr->se.exec_start = rq->clock_task;
+	cpuacct_charge(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
 
 static void set_curr_task_stop(struct rq *rq)
 {
+	struct task_struct *stop = rq->stop;
+
+	stop->se.exec_start = rq->clock_task;
 }
 
 static void switched_to_stop(struct rq *rq, struct task_struct *p)
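The stop_task.c change gives the stop class the same exec_start / sum_exec_runtime bookkeeping the other scheduling classes already do, which is what cures the "migration thread runtime bogosity" in the commit list: stamp exec_start when the task goes on-CPU, and account the elapsed slice when it is put back. A stand-alone sketch of that bookkeeping pattern, using illustrative types rather than the kernel's rq->clock_task and sched_entity fields shown in the diff:

```c
#include <stdint.h>

/* Illustrative stand-in for the few sched_entity fields involved. */
struct entity_sketch {
	uint64_t exec_start;		/* clock value when the task went on-CPU */
	uint64_t sum_exec_runtime;	/* accumulated runtime */
};

/* Stamp the start time when the task is picked or set as current. */
static void mark_on_cpu(struct entity_sketch *se, uint64_t clock)
{
	se->exec_start = clock;
}

/* Account the elapsed slice when the task is put back (put_prev). */
static void account_off_cpu(struct entity_sketch *se, uint64_t clock)
{
	int64_t delta = (int64_t)(clock - se->exec_start);

	if (delta < 0)			/* guard against a clock that went backwards */
		delta = 0;

	se->sum_exec_runtime += (uint64_t)delta;
	se->exec_start = clock;		/* restart the measurement window */
}
```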
