author	Glauber Costa <glommer@parallels.com>	2011-11-28 11:45:19 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-12-06 14:51:21 -0500
commit	54c707e98de9ca899e6552a47c797c62c45885ee (patch)
tree	61ec9be62b2b3db0201aca7c8eddb5e49239baf5 /kernel
parent	b39e66eaf9c573f38133e894256caeaf9fd2a528 (diff)
sched/accounting: Re-use scheduler statistics for the root cgroup
Right now, after we collect tick statistics for user and system and store
them in a well known location, we keep the same statistics again for
cpuacct. Since cpuacct is hierarchical, the numbers for the root cgroup
should be absolutely equal to the system-wide numbers, so it would be
better to just reuse them: this patch changes cpuacct accounting so that
the cpustat statistics are kept in a struct kernel_cpustat percpu array.
In the root cgroup case, we just point it to the main array. The rest of
the hierarchy walk can be totally disabled later with a static branch -
but I am not doing it here.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/1322498719-2255-4-git-send-email-glommer@parallels.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
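The key idea in miniature: the root group's statistics storage becomes the
system-wide kernel_cpustat array itself, so charging the system-wide array
once also covers the root cgroup. A minimal userspace sketch of that
aliasing (plain C; ordinary arrays stand in for percpu data, and the
two-level hierarchy, NR_STATS and main() are invented for illustration -
this is not the kernel code):

#include <stdio.h>

#define NR_STATS 2	/* stand-in for the CPUTIME_* index range */

struct cpuacct {
	unsigned long long *cpustat;	/* percpu in the kernel; plain array here */
	struct cpuacct *parent;
};

/* system-wide statistics; stand-in for the kernel_cpustat percpu array */
static unsigned long long kernel_cpustat[NR_STATS];

/* the root group aliases the system-wide array, as sched_init() now does */
static struct cpuacct root_cpuacct = { .cpustat = kernel_cpustat };

static void task_group_account_field(struct cpuacct *ca, int index,
				     unsigned long long tmp)
{
	/* every charge touches the root (== system-wide) stats, so do it first */
	kernel_cpustat[index] += tmp;

	/* walk only the non-root ancestors; the root was already charged */
	while (ca && ca != &root_cpuacct) {
		ca->cpustat[index] += tmp;
		ca = ca->parent;
	}
}

int main(void)
{
	unsigned long long child_stat[NR_STATS] = { 0 };
	struct cpuacct child = { .cpustat = child_stat, .parent = &root_cpuacct };

	task_group_account_field(&child, 0, 10);	/* task in a child group */
	task_group_account_field(&root_cpuacct, 0, 5);	/* task in the root group */

	/* the root's view and the system-wide view are the same storage */
	printf("system=%llu root=%llu child=%llu\n",
	       kernel_cpustat[0], root_cpuacct.cpustat[0], child_stat[0]);
	return 0;	/* prints: system=15 root=15 child=10 */
}

With this layout a tick charged to a task in the root group is a single
plain add; only tasks in non-root groups pay for the ancestor walk.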
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/core.c	165
-rw-r--r--	kernel/sched/sched.h	34
2 files changed, 106 insertions(+), 93 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a727c4ea9a3e..3e078f26cb67 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2556,6 +2556,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p,
+					    int index, u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+	struct kernel_cpustat *kcpustat;
+	struct cpuacct *ca;
+#endif
+	/*
+	 * Since all updates are sure to touch the root cgroup, we
+	 * get ourselves ahead and touch it first. If the root cgroup
+	 * is the only cgroup, then nothing else should be necessary.
+	 *
+	 */
+	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(p);
+	while (ca && (ca != &root_cpuacct)) {
+		kcpustat = this_cpu_ptr(ca->cpustat);
+		kcpustat->cpustat[index] += tmp;
+		ca = parent_ca(ca);
+	}
+	rcu_read_unlock();
+#endif
+}
+
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2580,7 +2616,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 	cpustat[index] += tmp;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
+	task_group_account_field(p, index, cputime);
 	/* Account for user time used */
 	acct_update_integrals(p);
 }
@@ -2636,7 +2672,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 
 	/* Add system time to cpustat. */
 	cpustat[index] += tmp;
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+	task_group_account_field(p, index, cputime);
 
 	/* Account for system time used */
 	acct_update_integrals(p);
@@ -6781,8 +6817,15 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CGROUP_CPUACCT
+	root_cpuacct.cpustat = &kernel_cpustat;
+	root_cpuacct.cpuusage = alloc_percpu(u64);
+	/* Too early, not expected to fail */
+	BUG_ON(!root_cpuacct.cpuusage);
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -7843,44 +7886,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
 	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	int i;
+	struct cpuacct *ca;
 
+	if (!cgrp->parent)
+		return &root_cpuacct.css;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca)
 		goto out;
 
@@ -7888,15 +7903,13 @@ static struct cgroup_subsys_state *cpuacct_create(
 	if (!ca->cpuusage)
 		goto out_free_ca;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		if (percpu_counter_init(&ca->cpustat[i], 0))
-			goto out_free_counters;
+	ca->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!ca->cpustat)
+		goto out_free_cpuusage;
 
 	return &ca->css;
 
-out_free_counters:
-	while (--i >= 0)
-		percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
 	free_percpu(ca->cpuusage);
 out_free_ca:
 	kfree(ca);
@@ -7909,10 +7922,8 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpustat);
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
@@ -8005,16 +8016,31 @@ static const char *cpuacct_stat_desc[] = {
 };
 
 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
 			      struct cgroup_map_cb *cb)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
+	int cpu;
+	s64 val = 0;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-		s64 val = percpu_counter_read(&ca->cpustat[i]);
-		val = cputime64_to_clock_t(val);
-		cb->fill(cb, cpuacct_stat_desc[i], val);
-	}
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_USER];
+		val += kcpustat->cpustat[CPUTIME_NICE];
+	}
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+	val = 0;
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_SYSTEM];
+		val += kcpustat->cpustat[CPUTIME_IRQ];
+		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+	}
+
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
 	return 0;
 }
 
@@ -8066,45 +8092,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	rcu_read_unlock();
 }
 
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
-	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH	0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val)
-{
-	struct cpuacct *ca;
-	int batch = CPUACCT_BATCH;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(tsk);
-
-	do {
-		__percpu_counter_add(&ca->cpustat[idx], val, batch);
-		ca = parent_ca(ca);
-	} while (ca);
-	rcu_read_unlock();
-}
-
 struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
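The removed CPUACCT_BATCH tuning existed only because percpu_counter folds
per-cpu deltas into one shared count once they exceed a batch threshold;
with a full kernel_cpustat per CPU, writers only ever touch their own CPU's
slot, and the folding happens at read time instead. A sketch of that
read-side aggregation, mirroring the new cpuacct_stats_show() (userspace
model; NCPUS, charge() and read_user_time() are invented for the example):

#include <stdio.h>

#define NCPUS		4
#define CPUTIME_USER	0
#define CPUTIME_NICE	1
#define NR_STATS	2

/* stand-in for alloc_percpu(struct kernel_cpustat) */
static unsigned long long cpustat[NCPUS][NR_STATS];

/* writer: always a plain add to the local CPU's slot, no batch logic */
static void charge(int cpu, int index, unsigned long long tmp)
{
	cpustat[cpu][index] += tmp;
}

/* reader: fold all CPUs at read time, as cpuacct_stats_show() now does */
static unsigned long long read_user_time(void)
{
	unsigned long long val = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		val += cpustat[cpu][CPUTIME_USER];
		val += cpustat[cpu][CPUTIME_NICE];
	}
	return val;
}

int main(void)
{
	charge(0, CPUTIME_USER, 7);
	charge(3, CPUTIME_NICE, 5);
	printf("user+nice: %llu\n", read_user_time());	/* prints 12 */
	return 0;
}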
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d88545c667e3..c24801636219 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -830,13 +830,39 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 extern void update_cpu_load(struct rq *this_rq);
 
 #ifdef CONFIG_CGROUP_CPUACCT
+#include <linux/cgroup.h>
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+	struct cgroup_subsys_state css;
+	/* cpuusage holds pointer to a u64-type object on every cpu */
+	u64 __percpu *cpuusage;
+	struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+	if (!ca || !ca->css.cgroup->parent)
+		return NULL;
+	return cgroup_ca(ca->css.cgroup->parent);
+}
+
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_nr_running(struct rq *rq)
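For reference, the cputime64_to_clock_t() step in cpuacct_stats_show() is
what converts the summed cpustat values into the clock_t units userspace
reads from cpuacct.stat. A worked example of that conversion, assuming
tick-based accounting where the values are jiffies, with HZ=1000 and
USER_HZ=100 (both configuration-dependent; the helper below is a local
model, not the kernel's jiffies_64_to_clock_t()):

#include <stdio.h>

#define HZ	1000ULL	/* kernel tick rate (assumed) */
#define USER_HZ	100ULL	/* clock_t rate seen by userspace (typical on Linux) */

static unsigned long long jiffies64_to_clock_t(unsigned long long j)
{
	return j / (HZ / USER_HZ);	/* exact only when USER_HZ divides HZ */
}

int main(void)
{
	/* 2500 ticks of user time -> 250 clock_t units, i.e. 2.5 seconds */
	printf("%llu\n", jiffies64_to_clock_t(2500));
	return 0;
}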