aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-04-09 13:37:28 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-09 13:37:28 -0400
commit17b2e9bf27d417bc186cc922b4d6d5eaa048f9d8 (patch)
tree7ae99be289ec2ffe68aa38926d9e9a13e4387ee0
parent422a253483aa5de71a2bcdc27b0aa023053f97f8 (diff)
parente3c8ca8336707062f3f7cb1cd7e6b3c753baccdd (diff)
Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: sched: do not count frozen tasks toward load sched: refresh MAINTAINERS entry sched: Print sched_group::__cpu_power in sched_domain_debug cpuacct: add per-cgroup utime/stime statistics posixtimers, sched: Fix posix clock monotonicity sched_rt: don't allocate cpumask in fastpath cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2
-rw-r--r--Documentation/cgroups/cpuacct.txt18
-rw-r--r--MAINTAINERS4
-rw-r--r--include/linux/sched.h3
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/sched.c160
-rw-r--r--kernel/sched_cpupri.c5
-rw-r--r--kernel/sched_rt.c15
7 files changed, 178 insertions, 34 deletions
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index bb775fbe43d7..8b930946c52a 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children 30process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in 31can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also. 32/cgroups/cpuacct.usage also.
33
34cpuacct.stat file lists a few statistics which further divide the
35CPU time obtained by the cgroup into user and system times. Currently
36the following statistics are supported:
37
38user: Time spent by tasks of the cgroup in user mode.
39system: Time spent by tasks of the cgroup in kernel mode.
40
41user and system are in USER_HZ unit.
42
43cpuacct controller uses percpu_counter interface to collect user and
44system times. This has two side effects:
45
46- It is theoretically possible to see wrong values for user and system times.
47 This is because percpu_counter_read() on 32bit systems isn't safe
48 against concurrent writes.
49- It is possible to see slightly outdated values for user and system times
50 due to the batch processing nature of percpu_counter.
diff --git a/MAINTAINERS b/MAINTAINERS
index 1f02d96a5dbf..5d843588e1de 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3873,8 +3873,8 @@ S: Maintained
3873SCHEDULER 3873SCHEDULER
3874P: Ingo Molnar 3874P: Ingo Molnar
3875M: mingo@elte.hu 3875M: mingo@elte.hu
3876P: Robert Love [the preemptible kernel bits] 3876P: Peter Zijlstra
3877M: rml@tech9.net 3877M: peterz@infradead.org
3878L: linux-kernel@vger.kernel.org 3878L: linux-kernel@vger.kernel.org
3879S: Maintained 3879S: Maintained
3880 3880
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 98e1fe51601d..b4c38bc8049c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
205#define task_is_stopped_or_traced(task) \ 205#define task_is_stopped_or_traced(task) \
206 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 206 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
207#define task_contributes_to_load(task) \ 207#define task_contributes_to_load(task) \
208 ((task->state & TASK_UNINTERRUPTIBLE) != 0) 208 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
209 (task->flags & PF_FROZEN) == 0)
209 210
210#define __set_task_state(tsk, state_value) \ 211#define __set_task_state(tsk, state_value) \
211 do { (tsk)->state = (state_value); } while (0) 212 do { (tsk)->state = (state_value); } while (0)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bb53185d8c78..c9dcf98b4463 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
224 cpu->cpu = virt_ticks(p); 224 cpu->cpu = virt_ticks(p);
225 break; 225 break;
226 case CPUCLOCK_SCHED: 226 case CPUCLOCK_SCHED:
227 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); 227 cpu->sched = task_sched_runtime(p);
228 break; 228 break;
229 } 229 }
230 return 0; 230 return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
305{ 305{
306 struct task_cputime cputime; 306 struct task_cputime cputime;
307 307
308 thread_group_cputime(p, &cputime);
309 switch (CPUCLOCK_WHICH(which_clock)) { 308 switch (CPUCLOCK_WHICH(which_clock)) {
310 default: 309 default:
311 return -EINVAL; 310 return -EINVAL;
312 case CPUCLOCK_PROF: 311 case CPUCLOCK_PROF:
312 thread_group_cputime(p, &cputime);
313 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 313 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
314 break; 314 break;
315 case CPUCLOCK_VIRT: 315 case CPUCLOCK_VIRT:
316 thread_group_cputime(p, &cputime);
316 cpu->cpu = cputime.utime; 317 cpu->cpu = cputime.utime;
317 break; 318 break;
318 case CPUCLOCK_SCHED: 319 case CPUCLOCK_SCHED:
319 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 320 cpu->sched = thread_group_sched_runtime(p);
320 break; 321 break;
321 } 322 }
322 return 0; 323 return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5d5072..5724508c3b66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418 struct rq_iterator *iterator); 1418 struct rq_iterator *iterator);
1419#endif 1419#endif
1420 1420
1421/* Time spent by the tasks of the cpu accounting group executing in ... */
1422enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER, /* ... user mode */
1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1425
1426 CPUACCT_STAT_NSTATS,
1427};
1428
1421#ifdef CONFIG_CGROUP_CPUACCT 1429#ifdef CONFIG_CGROUP_CPUACCT
1422static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1430static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val);
1423#else 1433#else
1424static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1434static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {}
1425#endif 1437#endif
1426 1438
1427static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1439static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4511EXPORT_PER_CPU_SYMBOL(kstat); 4523EXPORT_PER_CPU_SYMBOL(kstat);
4512 4524
4513/* 4525/*
4514 * Return any ns on the sched_clock that have not yet been banked in 4526 * Return any ns on the sched_clock that have not yet been accounted in
4515 * @p in case that task is currently running. 4527 * @p in case that task is currently running.
4528 *
4529 * Called with task_rq_lock() held on @rq.
4516 */ 4530 */
4531static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4532{
4533 u64 ns = 0;
4534
4535 if (task_current(rq, p)) {
4536 update_rq_clock(rq);
4537 ns = rq->clock - p->se.exec_start;
4538 if ((s64)ns < 0)
4539 ns = 0;
4540 }
4541
4542 return ns;
4543}
4544
4517unsigned long long task_delta_exec(struct task_struct *p) 4545unsigned long long task_delta_exec(struct task_struct *p)
4518{ 4546{
4519 unsigned long flags; 4547 unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
4521 u64 ns = 0; 4549 u64 ns = 0;
4522 4550
4523 rq = task_rq_lock(p, &flags); 4551 rq = task_rq_lock(p, &flags);
4552 ns = do_task_delta_exec(p, rq);
4553 task_rq_unlock(rq, &flags);
4524 4554
4525 if (task_current(rq, p)) { 4555 return ns;
4526 u64 delta_exec; 4556}
4527 4557
4528 update_rq_clock(rq); 4558/*
4529 delta_exec = rq->clock - p->se.exec_start; 4559 * Return accounted runtime for the task.
4530 if ((s64)delta_exec > 0) 4560 * In case the task is currently running, return the runtime plus current's
4531 ns = delta_exec; 4561 * pending runtime that have not been accounted yet.
4532 } 4562 */
4563unsigned long long task_sched_runtime(struct task_struct *p)
4564{
4565 unsigned long flags;
4566 struct rq *rq;
4567 u64 ns = 0;
4568
4569 rq = task_rq_lock(p, &flags);
4570 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4571 task_rq_unlock(rq, &flags);
4572
4573 return ns;
4574}
4575
4576/*
4577 * Return sum_exec_runtime for the thread group.
4578 * In case the task is currently running, return the sum plus current's
4579 * pending runtime that have not been accounted yet.
4580 *
4581 * Note that the thread group might have other running tasks as well,
4582 * so the return value not includes other pending runtime that other
4583 * running tasks might have.
4584 */
4585unsigned long long thread_group_sched_runtime(struct task_struct *p)
4586{
4587 struct task_cputime totals;
4588 unsigned long flags;
4589 struct rq *rq;
4590 u64 ns;
4533 4591
4592 rq = task_rq_lock(p, &flags);
4593 thread_group_cputime(p, &totals);
4594 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4534 task_rq_unlock(rq, &flags); 4595 task_rq_unlock(rq, &flags);
4535 4596
4536 return ns; 4597 return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
4559 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4560 else 4621 else
4561 cpustat->user = cputime64_add(cpustat->user, tmp); 4622 cpustat->user = cputime64_add(cpustat->user, tmp);
4623
4624 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4562 /* Account for user time used */ 4625 /* Account for user time used */
4563 acct_update_integrals(p); 4626 acct_update_integrals(p);
4564} 4627}
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4620 else 4683 else
4621 cpustat->system = cputime64_add(cpustat->system, tmp); 4684 cpustat->system = cputime64_add(cpustat->system, tmp);
4622 4685
4686 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4687
4623 /* Account for system time used */ 4688 /* Account for system time used */
4624 acct_update_integrals(p); 4689 acct_update_integrals(p);
4625} 4690}
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7302 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7367 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7303 7368
7304 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7369 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7305 printk(KERN_CONT " %s", str); 7370 printk(KERN_CONT " %s (__cpu_power = %d)", str,
7371 group->__cpu_power);
7306 7372
7307 group = group->next; 7373 group = group->next;
7308 } while (group != sd->groups); 7374 } while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
9925 struct cgroup_subsys_state css; 9991 struct cgroup_subsys_state css;
9926 /* cpuusage holds pointer to a u64-type object on every cpu */ 9992 /* cpuusage holds pointer to a u64-type object on every cpu */
9927 u64 *cpuusage; 9993 u64 *cpuusage;
9994 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9928 struct cpuacct *parent; 9995 struct cpuacct *parent;
9929}; 9996};
9930 9997
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
9949 struct cgroup_subsys *ss, struct cgroup *cgrp) 10016 struct cgroup_subsys *ss, struct cgroup *cgrp)
9950{ 10017{
9951 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10018 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10019 int i;
9952 10020
9953 if (!ca) 10021 if (!ca)
9954 return ERR_PTR(-ENOMEM); 10022 goto out;
9955 10023
9956 ca->cpuusage = alloc_percpu(u64); 10024 ca->cpuusage = alloc_percpu(u64);
9957 if (!ca->cpuusage) { 10025 if (!ca->cpuusage)
9958 kfree(ca); 10026 goto out_free_ca;
9959 return ERR_PTR(-ENOMEM); 10027
9960 } 10028 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10029 if (percpu_counter_init(&ca->cpustat[i], 0))
10030 goto out_free_counters;
9961 10031
9962 if (cgrp->parent) 10032 if (cgrp->parent)
9963 ca->parent = cgroup_ca(cgrp->parent); 10033 ca->parent = cgroup_ca(cgrp->parent);
9964 10034
9965 return &ca->css; 10035 return &ca->css;
10036
10037out_free_counters:
10038 while (--i >= 0)
10039 percpu_counter_destroy(&ca->cpustat[i]);
10040 free_percpu(ca->cpuusage);
10041out_free_ca:
10042 kfree(ca);
10043out:
10044 return ERR_PTR(-ENOMEM);
9966} 10045}
9967 10046
9968/* destroy an existing cpu accounting group */ 10047/* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
9970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10049cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9971{ 10050{
9972 struct cpuacct *ca = cgroup_ca(cgrp); 10051 struct cpuacct *ca = cgroup_ca(cgrp);
10052 int i;
9973 10053
10054 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10055 percpu_counter_destroy(&ca->cpustat[i]);
9974 free_percpu(ca->cpuusage); 10056 free_percpu(ca->cpuusage);
9975 kfree(ca); 10057 kfree(ca);
9976} 10058}
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10057 return 0; 10139 return 0;
10058} 10140}
10059 10141
10142static const char *cpuacct_stat_desc[] = {
10143 [CPUACCT_STAT_USER] = "user",
10144 [CPUACCT_STAT_SYSTEM] = "system",
10145};
10146
10147static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10148 struct cgroup_map_cb *cb)
10149{
10150 struct cpuacct *ca = cgroup_ca(cgrp);
10151 int i;
10152
10153 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10154 s64 val = percpu_counter_read(&ca->cpustat[i]);
10155 val = cputime64_to_clock_t(val);
10156 cb->fill(cb, cpuacct_stat_desc[i], val);
10157 }
10158 return 0;
10159}
10160
10060static struct cftype files[] = { 10161static struct cftype files[] = {
10061 { 10162 {
10062 .name = "usage", 10163 .name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
10067 .name = "usage_percpu", 10168 .name = "usage_percpu",
10068 .read_seq_string = cpuacct_percpu_seq_read, 10169 .read_seq_string = cpuacct_percpu_seq_read,
10069 }, 10170 },
10070 10171 {
10172 .name = "stat",
10173 .read_map = cpuacct_stats_show,
10174 },
10071}; 10175};
10072 10176
10073static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10177static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10089 return; 10193 return;
10090 10194
10091 cpu = task_cpu(tsk); 10195 cpu = task_cpu(tsk);
10196
10197 rcu_read_lock();
10198
10092 ca = task_ca(tsk); 10199 ca = task_ca(tsk);
10093 10200
10094 for (; ca; ca = ca->parent) { 10201 for (; ca; ca = ca->parent) {
10095 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10202 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10096 *cpuusage += cputime; 10203 *cpuusage += cputime;
10097 } 10204 }
10205
10206 rcu_read_unlock();
10207}
10208
10209/*
10210 * Charge the system/user time to the task's accounting group.
10211 */
10212static void cpuacct_update_stats(struct task_struct *tsk,
10213 enum cpuacct_stat_index idx, cputime_t val)
10214{
10215 struct cpuacct *ca;
10216
10217 if (unlikely(!cpuacct_subsys.active))
10218 return;
10219
10220 rcu_read_lock();
10221 ca = task_ca(tsk);
10222
10223 do {
10224 percpu_counter_add(&ca->cpustat[idx], val);
10225 ca = ca->parent;
10226 } while (ca);
10227 rcu_read_unlock();
10098} 10228}
10099 10229
10100struct cgroup_subsys cpuacct_subsys = { 10230struct cgroup_subsys cpuacct_subsys = {
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
55 * cpupri_find - find the best (lowest-pri) CPU in the system 55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context 56 * @cp: The cpupri context
57 * @p: The task 57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs 58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 59 *
60 * Note: This function returns the recommended CPUs as calculated during the 60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 61 * current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 84 if (lowest_mask)
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
85 return 1; 86 return 1;
86 } 87 }
87 88
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
948 948
949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950{ 950{
951 cpumask_var_t mask;
952
953 if (rq->curr->rt.nr_cpus_allowed == 1) 951 if (rq->curr->rt.nr_cpus_allowed == 1)
954 return; 952 return;
955 953
956 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
957 return;
958
959 if (p->rt.nr_cpus_allowed != 1 954 if (p->rt.nr_cpus_allowed != 1
960 && cpupri_find(&rq->rd->cpupri, p, mask)) 955 && cpupri_find(&rq->rd->cpupri, p, NULL))
961 goto free; 956 return;
962 957
963 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) 958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
964 goto free; 959 return;
965 960
966 /* 961 /*
967 * There appears to be other cpus that can accept 962 * There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
970 */ 965 */
971 requeue_task_rt(rq, p, 1); 966 requeue_task_rt(rq, p, 1);
972 resched_task(rq->curr); 967 resched_task(rq->curr);
973free:
974 free_cpumask_var(mask);
975} 968}
976 969
977#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */