Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: do not count frozen tasks toward load
  sched: refresh MAINTAINERS entry
  sched: Print sched_group::__cpu_power in sched_domain_debug
  cpuacct: add per-cgroup utime/stime statistics
  posixtimers, sched: Fix posix clock monotonicity
  sched_rt: don't allocate cpumask in fastpath
  cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-09 13:37:28 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-09 13:37:28 -0400
commit: 17b2e9bf27d417bc186cc922b4d6d5eaa048f9d8 (patch)
tree: 7ae99be289ec2ffe68aa38926d9e9a13e4387ee0
parent: 422a253483aa5de71a2bcdc27b0aa023053f97f8 (diff)
parent: e3c8ca8336707062f3f7cb1cd7e6b3c753baccdd (diff)
7 files changed, 178 insertions, 34 deletions
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index bb775fbe43d7..8b930946c52a 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
 process (bash) into it. CPU time consumed by this bash and its children
 can be obtained from g1/cpuacct.usage and the same is accumulated in
 /cgroups/cpuacct.usage also.
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+user and system are in USER_HZ unit.
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/MAINTAINERS b/MAINTAINERS
index 1f02d96a5dbf..5d843588e1de 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3873,8 +3873,8 @@ S:	Maintained
 SCHEDULER
 P:      Ingo Molnar
 M:      mingo@elte.hu
-P:      Robert Love    [the preemptible kernel bits]
+P:      Peter Zijlstra
-M:      rml@tech9.net
+M:      peterz@infradead.org
 L:      linux-kernel@vger.kernel.org
 S:      Maintained
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 98e1fe51601d..b4c38bc8049c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
 #define task_is_stopped_or_traced(task) \
                        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task)  \
-                                ((task->state & TASK_UNINTERRUPTIBLE) != 0)
+                                ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
+                                 (task->flags & PF_FROZEN) == 0)
 #define __set_task_state(tsk, state_value)              \
        do { (tsk)->state = (state_value); } while (0)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bb53185d8c78..c9dcf98b4463 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                cpu->cpu = virt_ticks(p);
                break;
        case CPUCLOCK_SCHED:
-                cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = task_sched_runtime(p);
                break;
        }
        return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
        struct task_cputime cputime;
-        thread_group_cputime(p, &cputime);
        switch (CPUCLOCK_WHICH(which_clock)) {
        default:
                return -EINVAL;
        case CPUCLOCK_PROF:
+                thread_group_cputime(p, &cputime);
                cpu->cpu = cputime_add(cputime.utime, cputime.stime);
                break;
        case CPUCLOCK_VIRT:
+                thread_group_cputime(p, &cputime);
                cpu->cpu = cputime.utime;
                break;
        case CPUCLOCK_SCHED:
-                cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = thread_group_sched_runtime(p);
                break;
        }
        return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5d5072..5724508c3b66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   struct rq_iterator *iterator);
 #endif
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+        CPUACCT_STAT_USER,      /* ... user mode */
+        CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+        CPUACCT_STAT_NSTATS,
+};
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
 */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+        u64 ns = 0;
+        if (task_current(rq, p)) {
+                update_rq_clock(rq);
+                ns = rq->clock - p->se.exec_start;
+                if ((s64)ns < 0)
+                        ns = 0;
+        }
+        return ns;
+}
 unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
        u64 ns = 0;
        rq = task_rq_lock(p, &flags);
+        ns = do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
-        if (task_current(rq, p)) {
+        return ns;
-                u64 delta_exec;
+}
-                update_rq_clock(rq);
+/*
-                delta_exec = rq->clock - p->se.exec_start;
+ * Return accounted runtime for the task.
-                if ((s64)delta_exec > 0)
+ * In case the task is currently running, return the runtime plus current's
-                        ns = delta_exec;
+ * pending runtime that have not been accounted yet.
-        }
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns = 0;
+        rq = task_rq_lock(p, &flags);
+        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
+        return ns;
+}
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value not includes other pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+        struct task_cputime totals;
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns;
+        rq = task_rq_lock(p, &flags);
+        thread_group_cputime(p, &totals);
+        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
        task_rq_unlock(rq, &flags);
        return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                cpustat->nice = cputime64_add(cpustat->nice, tmp);
        else
                cpustat->user = cputime64_add(cpustat->user, tmp);
+        cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
        /* Account for user time used */
        acct_update_integrals(p);
 }
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        else
                cpustat->system = cputime64_add(cpustat->system, tmp);
+        cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
        /* Account for system time used */
        acct_update_integrals(p);
 }
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpumask_or(groupmask, groupmask, sched_group_cpus(group));
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-                printk(KERN_CONT " %s", str);
+                printk(KERN_CONT " %s (__cpu_power = %d)", str,
+                                                group->__cpu_power);
                group = group->next;
        } while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+        struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
        struct cpuacct *parent;
 };
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
        struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
        struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+        int i;
        if (!ca)
-                return ERR_PTR(-ENOMEM);
+                goto out;
        ca->cpuusage = alloc_percpu(u64);
-        if (!ca->cpuusage) {
+        if (!ca->cpuusage)
-                kfree(ca);
+                goto out_free_ca;
-                return ERR_PTR(-ENOMEM);
-        }
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                if (percpu_counter_init(&ca->cpustat[i], 0))
+                        goto out_free_counters;
        if (cgrp->parent)
                ca->parent = cgroup_ca(cgrp->parent);
        return &ca->css;
+out_free_counters:
+        while (--i >= 0)
+                percpu_counter_destroy(&ca->cpustat[i]);
+        free_percpu(ca->cpuusage);
+out_free_ca:
+        kfree(ca);
+out:
+        return ERR_PTR(-ENOMEM);
 }
 /* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
        struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                percpu_counter_destroy(&ca->cpustat[i]);
        free_percpu(ca->cpuusage);
        kfree(ca);
 }
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
        return 0;
 }
+static const char *cpuacct_stat_desc[] = {
+        [CPUACCT_STAT_USER] = "user",
+        [CPUACCT_STAT_SYSTEM] = "system",
+};
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+                struct cgroup_map_cb *cb)
+{
+        struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+                s64 val = percpu_counter_read(&ca->cpustat[i]);
+                val = cputime64_to_clock_t(val);
+                cb->fill(cb, cpuacct_stat_desc[i], val);
+        }
+        return 0;
+}
 static struct cftype files[] = {
        {
                .name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
                .name = "usage_percpu",
                .read_seq_string = cpuacct_percpu_seq_read,
        },
+        {
+                .name = "stat",
+                .read_map = cpuacct_stats_show,
+        },
 };
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                return;
        cpu = task_cpu(tsk);
+        rcu_read_lock();
        ca = task_ca(tsk);
        for (; ca; ca = ca->parent) {
                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
+        rcu_read_unlock();
+}
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val)
+{
+        struct cpuacct *ca;
+        if (unlikely(!cpuacct_subsys.active))
+                return;
+        rcu_read_lock();
+        ca = task_ca(tsk);
+        do {
+                percpu_counter_add(&ca->cpustat[idx], val);
+                ca = ca->parent;
+        } while (ca);
+        rcu_read_unlock();
 }
 struct cgroup_subsys cpuacct_subsys = {
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
 * cpupri_find - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 *
 * Note: This function returns the recommended CPUs as calculated during the
 * current invokation.  By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
                if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                        continue;
-                cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+                if (lowest_mask)
+                        cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
                return 1;
        }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-        cpumask_var_t mask;
        if (rq->curr->rt.nr_cpus_allowed == 1)
                return;
-        if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
-                return;
        if (p->rt.nr_cpus_allowed != 1
-            && cpupri_find(&rq->rd->cpupri, p, mask))
+            && cpupri_find(&rq->rd->cpupri, p, NULL))
-                goto free;
+                return;
-        if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
-                goto free;
+                return;
        /*
         * There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
         */
        requeue_task_rt(rq, p, 1);
        resched_task(rq->curr);
-free:
-        free_cpumask_var(mask);
 }
 #endif /* CONFIG_SMP */
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-09 13:37:28 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-09 13:37:28 -0400
commit	17b2e9bf27d417bc186cc922b4d6d5eaa048f9d8 (patch)
tree	7ae99be289ec2ffe68aa38926d9e9a13e4387ee0
parent	422a253483aa5de71a2bcdc27b0aa023053f97f8 (diff)
parent	e3c8ca8336707062f3f7cb1cd7e6b3c753baccdd (diff)

diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt index bb775fbe43d7..8b930946c52a 100644 --- a/Documentation/cgroups/cpuacct.txt +++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
30	process (bash) into it. CPU time consumed by this bash and its children	30	process (bash) into it. CPU time consumed by this bash and its children
31	can be obtained from g1/cpuacct.usage and the same is accumulated in	31	can be obtained from g1/cpuacct.usage and the same is accumulated in
32	/cgroups/cpuacct.usage also.	32	/cgroups/cpuacct.usage also.
		33
		34	cpuacct.stat file lists a few statistics which further divide the
		35	CPU time obtained by the cgroup into user and system times. Currently
		36	the following statistics are supported:
		37
		38	user: Time spent by tasks of the cgroup in user mode.
		39	system: Time spent by tasks of the cgroup in kernel mode.
		40
		41	user and system are in USER_HZ unit.
		42
		43	cpuacct controller uses percpu_counter interface to collect user and
		44	system times. This has two side effects:
		45
		46	- It is theoretically possible to see wrong values for user and system times.
		47	This is because percpu_counter_read() on 32bit systems isn't safe
		48	against concurrent writes.
		49	- It is possible to see slightly outdated values for user and system times
		50	due to the batch processing nature of percpu_counter.


diff --git a/MAINTAINERS b/MAINTAINERS index 1f02d96a5dbf..5d843588e1de 100644 --- a/MAINTAINERS +++ b/MAINTAINERS
@@ -3873,8 +3873,8 @@ S: Maintained
3873	SCHEDULER	3873	SCHEDULER
3874	P: Ingo Molnar	3874	P: Ingo Molnar
3875	M: mingo@elte.hu	3875	M: mingo@elte.hu
3876	P: Robert Love [the preemptible kernel bits]	3876	P: Peter Zijlstra
3877	M: rml@tech9.net	3877	M: peterz@infradead.org
3878	L: linux-kernel@vger.kernel.org	3878	L: linux-kernel@vger.kernel.org
3879	S: Maintained	3879	S: Maintained
3880		3880


diff --git a/include/linux/sched.h b/include/linux/sched.h index 98e1fe51601d..b4c38bc8049c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
205	#define task_is_stopped_or_traced(task) \	205	#define task_is_stopped_or_traced(task) \
206	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)	206	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
207	#define task_contributes_to_load(task) \	207	#define task_contributes_to_load(task) \
208	((task->state & TASK_UNINTERRUPTIBLE) != 0)	208	((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
		209	(task->flags & PF_FROZEN) == 0)
209		210
210	#define __set_task_state(tsk, state_value) \	211	#define __set_task_state(tsk, state_value) \
211	do { (tsk)->state = (state_value); } while (0)	212	do { (tsk)->state = (state_value); } while (0)


diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bb53185d8c78..c9dcf98b4463 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
224	cpu->cpu = virt_ticks(p);	224	cpu->cpu = virt_ticks(p);
225	break;	225	break;
226	case CPUCLOCK_SCHED:	226	case CPUCLOCK_SCHED:
227	cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);	227	cpu->sched = task_sched_runtime(p);
228	break;	228	break;
229	}	229	}
230	return 0;	230	return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
305	{	305	{
306	struct task_cputime cputime;	306	struct task_cputime cputime;
307		307
308	thread_group_cputime(p, &cputime);
309	switch (CPUCLOCK_WHICH(which_clock)) {	308	switch (CPUCLOCK_WHICH(which_clock)) {
310	default:	309	default:
311	return -EINVAL;	310	return -EINVAL;
312	case CPUCLOCK_PROF:	311	case CPUCLOCK_PROF:
		312	thread_group_cputime(p, &cputime);
313	cpu->cpu = cputime_add(cputime.utime, cputime.stime);	313	cpu->cpu = cputime_add(cputime.utime, cputime.stime);
314	break;	314	break;
315	case CPUCLOCK_VIRT:	315	case CPUCLOCK_VIRT:
		316	thread_group_cputime(p, &cputime);
316	cpu->cpu = cputime.utime;	317	cpu->cpu = cputime.utime;
317	break;	318	break;
318	case CPUCLOCK_SCHED:	319	case CPUCLOCK_SCHED:
319	cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);	320	cpu->sched = thread_group_sched_runtime(p);
320	break;	321	break;
321	}	322	}
322	return 0;	323	return 0;


diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d5072..5724508c3b66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418	struct rq_iterator *iterator);	1418	struct rq_iterator *iterator);
1419	#endif	1419	#endif
1420		1420
		1421	/* Time spent by the tasks of the cpu accounting group executing in ... */
		1422	enum cpuacct_stat_index {
		1423	CPUACCT_STAT_USER, /* ... user mode */
		1424	CPUACCT_STAT_SYSTEM, /* ... kernel mode */
		1425
		1426	CPUACCT_STAT_NSTATS,
		1427	};
		1428
1421	#ifdef CONFIG_CGROUP_CPUACCT	1429	#ifdef CONFIG_CGROUP_CPUACCT
1422	static void cpuacct_charge(struct task_struct *tsk, u64 cputime);	1430	static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
		1431	static void cpuacct_update_stats(struct task_struct *tsk,
		1432	enum cpuacct_stat_index idx, cputime_t val);
1423	#else	1433	#else
1424	static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}	1434	static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
		1435	static inline void cpuacct_update_stats(struct task_struct *tsk,
		1436	enum cpuacct_stat_index idx, cputime_t val) {}
1425	#endif	1437	#endif
1426		1438
1427	static inline void inc_cpu_load(struct rq *rq, unsigned long load)	1439	static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4511	EXPORT_PER_CPU_SYMBOL(kstat);	4523	EXPORT_PER_CPU_SYMBOL(kstat);
4512		4524
4513	/*	4525	/*
4514	* Return any ns on the sched_clock that have not yet been banked in	4526	* Return any ns on the sched_clock that have not yet been accounted in
4515	* @p in case that task is currently running.	4527	* @p in case that task is currently running.
		4528	*
		4529	* Called with task_rq_lock() held on @rq.
4516	*/	4530	*/
		4531	static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
		4532	{
		4533	u64 ns = 0;
		4534
		4535	if (task_current(rq, p)) {
		4536	update_rq_clock(rq);
		4537	ns = rq->clock - p->se.exec_start;
		4538	if ((s64)ns < 0)
		4539	ns = 0;
		4540	}
		4541
		4542	return ns;
		4543	}
		4544
4517	unsigned long long task_delta_exec(struct task_struct *p)	4545	unsigned long long task_delta_exec(struct task_struct *p)
4518	{	4546	{
4519	unsigned long flags;	4547	unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
4521	u64 ns = 0;	4549	u64 ns = 0;
4522		4550
4523	rq = task_rq_lock(p, &flags);	4551	rq = task_rq_lock(p, &flags);
		4552	ns = do_task_delta_exec(p, rq);
		4553	task_rq_unlock(rq, &flags);
4524		4554
4525	if (task_current(rq, p)) {	4555	return ns;
4526	u64 delta_exec;	4556	}
4527		4557
4528	update_rq_clock(rq);	4558	/*
4529	delta_exec = rq->clock - p->se.exec_start;	4559	* Return accounted runtime for the task.
4530	if ((s64)delta_exec > 0)	4560	* In case the task is currently running, return the runtime plus current's
4531	ns = delta_exec;	4561	* pending runtime that have not been accounted yet.
4532	}	4562	*/
		4563	unsigned long long task_sched_runtime(struct task_struct *p)
		4564	{
		4565	unsigned long flags;
		4566	struct rq *rq;
		4567	u64 ns = 0;
		4568
		4569	rq = task_rq_lock(p, &flags);
		4570	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
		4571	task_rq_unlock(rq, &flags);
		4572
		4573	return ns;
		4574	}
		4575
		4576	/*
		4577	* Return sum_exec_runtime for the thread group.
		4578	* In case the task is currently running, return the sum plus current's
		4579	* pending runtime that have not been accounted yet.
		4580	*
		4581	* Note that the thread group might have other running tasks as well,
		4582	* so the return value not includes other pending runtime that other
		4583	* running tasks might have.
		4584	*/
		4585	unsigned long long thread_group_sched_runtime(struct task_struct *p)
		4586	{
		4587	struct task_cputime totals;
		4588	unsigned long flags;
		4589	struct rq *rq;
		4590	u64 ns;
4533		4591
		4592	rq = task_rq_lock(p, &flags);
		4593	thread_group_cputime(p, &totals);
		4594	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4534	task_rq_unlock(rq, &flags);	4595	task_rq_unlock(rq, &flags);
4535		4596
4536	return ns;	4597	return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
4559	cpustat->nice = cputime64_add(cpustat->nice, tmp);	4620	cpustat->nice = cputime64_add(cpustat->nice, tmp);
4560	else	4621	else
4561	cpustat->user = cputime64_add(cpustat->user, tmp);	4622	cpustat->user = cputime64_add(cpustat->user, tmp);
		4623
		4624	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4562	/* Account for user time used */	4625	/* Account for user time used */
4563	acct_update_integrals(p);	4626	acct_update_integrals(p);
4564	}	4627	}
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4620	else	4683	else
4621	cpustat->system = cputime64_add(cpustat->system, tmp);	4684	cpustat->system = cputime64_add(cpustat->system, tmp);
4622		4685
		4686	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
		4687
4623	/* Account for system time used */	4688	/* Account for system time used */
4624	acct_update_integrals(p);	4689	acct_update_integrals(p);
4625	}	4690	}
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7302	cpumask_or(groupmask, groupmask, sched_group_cpus(group));	7367	cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7303		7368
7304	cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));	7369	cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7305	printk(KERN_CONT " %s", str);	7370	printk(KERN_CONT " %s (__cpu_power = %d)", str,
		7371	group->__cpu_power);
7306		7372
7307	group = group->next;	7373	group = group->next;
7308	} while (group != sd->groups);	7374	} while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
9925	struct cgroup_subsys_state css;	9991	struct cgroup_subsys_state css;
9926	/* cpuusage holds pointer to a u64-type object on every cpu */	9992	/* cpuusage holds pointer to a u64-type object on every cpu */
9927	u64 *cpuusage;	9993	u64 *cpuusage;
		9994	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9928	struct cpuacct *parent;	9995	struct cpuacct *parent;
9929	};	9996	};
9930		9997
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
9949	struct cgroup_subsys *ss, struct cgroup *cgrp)	10016	struct cgroup_subsys *ss, struct cgroup *cgrp)
9950	{	10017	{
9951	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);	10018	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		10019	int i;
9952		10020
9953	if (!ca)	10021	if (!ca)
9954	return ERR_PTR(-ENOMEM);	10022	goto out;
9955		10023
9956	ca->cpuusage = alloc_percpu(u64);	10024	ca->cpuusage = alloc_percpu(u64);
9957	if (!ca->cpuusage) {	10025	if (!ca->cpuusage)
9958	kfree(ca);	10026	goto out_free_ca;
9959	return ERR_PTR(-ENOMEM);	10027
9960	}	10028	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
		10029	if (percpu_counter_init(&ca->cpustat[i], 0))
		10030	goto out_free_counters;
9961		10031
9962	if (cgrp->parent)	10032	if (cgrp->parent)
9963	ca->parent = cgroup_ca(cgrp->parent);	10033	ca->parent = cgroup_ca(cgrp->parent);
9964		10034
9965	return &ca->css;	10035	return &ca->css;
		10036
		10037	out_free_counters:
		10038	while (--i >= 0)
		10039	percpu_counter_destroy(&ca->cpustat[i]);
		10040	free_percpu(ca->cpuusage);
		10041	out_free_ca:
		10042	kfree(ca);
		10043	out:
		10044	return ERR_PTR(-ENOMEM);
9966	}	10045	}
9967		10046
9968	/* destroy an existing cpu accounting group */	10047	/* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
9970	cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)	10049	cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9971	{	10050	{
9972	struct cpuacct *ca = cgroup_ca(cgrp);	10051	struct cpuacct *ca = cgroup_ca(cgrp);
		10052	int i;
9973		10053
		10054	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
		10055	percpu_counter_destroy(&ca->cpustat[i]);
9974	free_percpu(ca->cpuusage);	10056	free_percpu(ca->cpuusage);
9975	kfree(ca);	10057	kfree(ca);
9976	}	10058	}
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10057	return 0;	10139	return 0;
10058	}	10140	}
10059		10141
		10142	static const char *cpuacct_stat_desc[] = {
		10143	[CPUACCT_STAT_USER] = "user",
		10144	[CPUACCT_STAT_SYSTEM] = "system",
		10145	};
		10146
		10147	static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
		10148	struct cgroup_map_cb *cb)
		10149	{
		10150	struct cpuacct *ca = cgroup_ca(cgrp);
		10151	int i;
		10152
		10153	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
		10154	s64 val = percpu_counter_read(&ca->cpustat[i]);
		10155	val = cputime64_to_clock_t(val);
		10156	cb->fill(cb, cpuacct_stat_desc[i], val);
		10157	}
		10158	return 0;
		10159	}
		10160
10060	static struct cftype files[] = {	10161	static struct cftype files[] = {
10061	{	10162	{
10062	.name = "usage",	10163	.name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
10067	.name = "usage_percpu",	10168	.name = "usage_percpu",
10068	.read_seq_string = cpuacct_percpu_seq_read,	10169	.read_seq_string = cpuacct_percpu_seq_read,
10069	},	10170	},
10070		10171	{
		10172	.name = "stat",
		10173	.read_map = cpuacct_stats_show,
		10174	},
10071	};	10175	};
10072		10176
10073	static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)	10177	static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10089	return;	10193	return;
10090		10194
10091	cpu = task_cpu(tsk);	10195	cpu = task_cpu(tsk);
		10196
		10197	rcu_read_lock();
		10198
10092	ca = task_ca(tsk);	10199	ca = task_ca(tsk);
10093		10200
10094	for (; ca; ca = ca->parent) {	10201	for (; ca; ca = ca->parent) {
10095	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);	10202	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10096	*cpuusage += cputime;	10203	*cpuusage += cputime;
10097	}	10204	}
		10205
		10206	rcu_read_unlock();
		10207	}
		10208
		10209	/*
		10210	* Charge the system/user time to the task's accounting group.
		10211	*/
		10212	static void cpuacct_update_stats(struct task_struct *tsk,
		10213	enum cpuacct_stat_index idx, cputime_t val)
		10214	{
		10215	struct cpuacct *ca;
		10216
		10217	if (unlikely(!cpuacct_subsys.active))
		10218	return;
		10219
		10220	rcu_read_lock();
		10221	ca = task_ca(tsk);
		10222
		10223	do {
		10224	percpu_counter_add(&ca->cpustat[idx], val);
		10225	ca = ca->parent;
		10226	} while (ca);
		10227	rcu_read_unlock();
10098	}	10228	}
10099		10229
10100	struct cgroup_subsys cpuacct_subsys = {	10230	struct cgroup_subsys cpuacct_subsys = {


diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 1e00bfacf9b8..cdd3c89574cd 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
55	* cpupri_find - find the best (lowest-pri) CPU in the system	55	* cpupri_find - find the best (lowest-pri) CPU in the system
56	* @cp: The cpupri context	56	* @cp: The cpupri context
57	* @p: The task	57	* @p: The task
58	* @lowest_mask: A mask to fill in with selected CPUs	58	* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59	*	59	*
60	* Note: This function returns the recommended CPUs as calculated during the	60	* Note: This function returns the recommended CPUs as calculated during the
61	* current invokation. By the time the call returns, the CPUs may have in	61	* current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)	81	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82	continue;	82	continue;
83		83
84	cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);	84	if (lowest_mask)
		85	cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
85	return 1;	86	return 1;
86	}	87	}
87		88


diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 299d012b4394..f2c66f8f9712 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
948		948
949	static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)	949	static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950	{	950	{
951	cpumask_var_t mask;
952
953	if (rq->curr->rt.nr_cpus_allowed == 1)	951	if (rq->curr->rt.nr_cpus_allowed == 1)
954	return;	952	return;
955		953
956	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
957	return;
958
959	if (p->rt.nr_cpus_allowed != 1	954	if (p->rt.nr_cpus_allowed != 1
960	&& cpupri_find(&rq->rd->cpupri, p, mask))	955	&& cpupri_find(&rq->rd->cpupri, p, NULL))
961	goto free;	956	return;
962		957
963	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))	958	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
964	goto free;	959	return;
965		960
966	/*	961	/*
967	* There appears to be other cpus that can accept	962	* There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
970	*/	965	*/
971	requeue_task_rt(rq, p, 1);	966	requeue_task_rt(rq, p, 1);
972	resched_task(rq->curr);	967	resched_task(rq->curr);
973	free:
974	free_cpumask_var(mask);
975	}	968	}
976		969
977	#endif /* CONFIG_SMP */	970	#endif /* CONFIG_SMP */