author		Glauber Costa <glommer@parallels.com>	2011-11-28 11:45:19 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-12-06 14:51:21 -0500
commit		54c707e98de9ca899e6552a47c797c62c45885ee
tree		61ec9be62b2b3db0201aca7c8eddb5e49239baf5 /kernel
parent		b39e66eaf9c573f38133e894256caeaf9fd2a528
sched/accounting: Re-use scheduler statistics for the root cgroup
Right now, after we collect tick statistics for user and system time and
store them in a well-known location, we keep the same statistics again for
cpuacct. Since cpuacct is hierarchical, the numbers for the root cgroup
should be exactly equal to the system-wide numbers.

So it would be better to just reuse them: this patch changes cpuacct
accounting so that the cpustat statistics are kept in a struct
kernel_cpustat percpu array. In the root cgroup case, we just point it at
the main array. The rest of the hierarchy walk can later be disabled
entirely with a static branch - but I am not doing that here.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/1322498719-2255-4-git-send-email-glommer@parallels.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c  | 165
-rw-r--r--  kernel/sched/sched.h |  34
2 files changed, 106 insertions, 93 deletions
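The mechanics are easy to see outside the kernel. Below is a minimal
userspace analogue of the scheme (illustrative only - the names, sizes, and
single-threaded accounting are stand-ins, not kernel APIs): the root
group's stats pointer aliases the global per-cpu array, so every charge
lands in the global statistics exactly once, and the hierarchy walk stops
before reaching the root.

/*
 * Userspace sketch of the root-cgroup aliasing trick (not kernel code).
 * Build: cc -o cpuacct-sketch cpuacct-sketch.c
 */
#include <stdio.h>

#define NR_CPUS  2
#define NR_STATS 2      /* stand-ins for CPUTIME_USER, CPUTIME_SYSTEM */

struct kstat {
        unsigned long long cpustat[NR_STATS];
};

struct group {
        struct group *parent;
        struct kstat *stat;             /* one entry per cpu */
};

/* the system-wide statistics, playing the role of kernel_cpustat */
static struct kstat global_stat[NR_CPUS];

/* the root group gets no array of its own: it aliases the global one */
static struct group root = { .parent = NULL, .stat = global_stat };

static void account(struct group *g, int cpu, int index,
                    unsigned long long delta)
{
        /* every update is sure to touch the root, so touch it first */
        global_stat[cpu].cpustat[index] += delta;

        /* walk the hierarchy, stopping before root to avoid double counting */
        for (; g && g != &root; g = g->parent)
                g->stat[cpu].cpustat[index] += delta;
}

int main(void)
{
        static struct kstat child_stat[NR_CPUS];        /* zero-initialized */
        struct group child = { .parent = &root, .stat = child_stat };

        account(&child, 0, 0, 100);    /* charge 100 ticks of "user" time */
        account(&root,  0, 1, 50);     /* root-only charge: global_stat only */

        printf("global: user=%llu system=%llu\n",
               global_stat[0].cpustat[0], global_stat[0].cpustat[1]);
        printf("child:  user=%llu system=%llu\n",
               child_stat[0].cpustat[0], child_stat[0].cpustat[1]);
        return 0;
}

Reading the root group then costs nothing extra: its numbers are, by
construction, the system-wide ones.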
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a727c4ea9a3e..3e078f26cb67 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2556,6 +2556,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p,
+					    u64 tmp, int index)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+	struct kernel_cpustat *kcpustat;
+	struct cpuacct *ca;
+#endif
+	/*
+	 * Since all updates are sure to touch the root cgroup, we
+	 * get ourselves ahead and touch it first. If the root cgroup
+	 * is the only cgroup, then nothing else should be necessary.
+	 *
+	 */
+	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(p);
+	while (ca && (ca != &root_cpuacct)) {
+		kcpustat = this_cpu_ptr(ca->cpustat);
+		kcpustat->cpustat[index] += tmp;
+		ca = parent_ca(ca);
+	}
+	rcu_read_unlock();
+#endif
+}
+
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2580,7 +2616,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 	cpustat[index] += tmp;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
+	task_group_account_field(p, index, cputime);
 	/* Account for user time used */
 	acct_update_integrals(p);
 }
@@ -2636,7 +2672,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 
 	/* Add system time to cpustat. */
 	cpustat[index] += tmp;
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+	task_group_account_field(p, index, cputime);
 
 	/* Account for system time used */
 	acct_update_integrals(p);
@@ -6781,8 +6817,15 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CGROUP_CPUACCT
+	root_cpuacct.cpustat = &kernel_cpustat;
+	root_cpuacct.cpuusage = alloc_percpu(u64);
+	/* Too early, not expected to fail */
+	BUG_ON(!root_cpuacct.cpuusage);
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -7843,44 +7886,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
 	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	int i;
+	struct cpuacct *ca;
 
+	if (!cgrp->parent)
+		return &root_cpuacct.css;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca)
 		goto out;
 
@@ -7888,15 +7903,13 @@ static struct cgroup_subsys_state *cpuacct_create(
 	if (!ca->cpuusage)
 		goto out_free_ca;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		if (percpu_counter_init(&ca->cpustat[i], 0))
-			goto out_free_counters;
+	ca->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!ca->cpustat)
+		goto out_free_cpuusage;
 
 	return &ca->css;
 
-out_free_counters:
-	while (--i >= 0)
-		percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
 	free_percpu(ca->cpuusage);
 out_free_ca:
 	kfree(ca);
@@ -7909,10 +7922,8 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpustat);
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
@@ -8005,16 +8016,31 @@ static const char *cpuacct_stat_desc[] = {
 };
 
 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
 			      struct cgroup_map_cb *cb)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
+	int cpu;
+	s64 val = 0;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-		s64 val = percpu_counter_read(&ca->cpustat[i]);
-		val = cputime64_to_clock_t(val);
-		cb->fill(cb, cpuacct_stat_desc[i], val);
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_USER];
+		val += kcpustat->cpustat[CPUTIME_NICE];
 	}
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+	val = 0;
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_SYSTEM];
+		val += kcpustat->cpustat[CPUTIME_IRQ];
+		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+	}
+
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
 	return 0;
 }
 
@@ -8066,45 +8092,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	rcu_read_unlock();
 }
 
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
-	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH	0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val)
-{
-	struct cpuacct *ca;
-	int batch = CPUACCT_BATCH;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(tsk);
-
-	do {
-		__percpu_counter_add(&ca->cpustat[idx], val, batch);
-		ca = parent_ca(ca);
-	} while (ca);
-	rcu_read_unlock();
-}
-
 struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d88545c667e3..c24801636219 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -830,13 +830,39 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 extern void update_cpu_load(struct rq *this_rq);
 
 #ifdef CONFIG_CGROUP_CPUACCT
+#include <linux/cgroup.h>
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+	struct cgroup_subsys_state css;
+	/* cpuusage holds pointer to a u64-type object on every cpu */
+	u64 __percpu *cpuusage;
+	struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+	if (!ca || !ca->css.cgroup->parent)
+		return NULL;
+	return cgroup_ca(ca->css.cgroup->parent);
+}
+
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_nr_running(struct rq *rq)
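With the percpu_counter machinery gone, a read of cpuacct.stat now sums the
percpu kernel_cpustat buckets on the fly: the "user" field is CPUTIME_USER
plus CPUTIME_NICE, and the "system" field is CPUTIME_SYSTEM plus CPUTIME_IRQ
plus CPUTIME_SOFTIRQ, each converted to clock ticks by
cputime64_to_clock_t(). The user-visible format is unchanged; assuming the
cpuacct controller is mounted in the usual place, a read still looks like
this (group name and values illustrative):

# cat /sys/fs/cgroup/cpuacct/mygroup/cpuacct.stat
user 4322
system 1519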