diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 254 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 296 | ||||
-rw-r--r-- | kernel/sched/cpuacct.h | 17 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 214 | ||||
-rw-r--r-- | kernel/sched/fair.c | 148 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 16 | ||||
-rw-r--r-- | kernel/sched/sched.h | 219 |
8 files changed, 718 insertions, 447 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8285eb0cde6..ebdb19541218 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1288,8 +1288,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | |||
1288 | static void | 1288 | static void |
1289 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | 1289 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) |
1290 | { | 1290 | { |
1291 | trace_sched_wakeup(p, true); | ||
1292 | check_preempt_curr(rq, p, wake_flags); | 1291 | check_preempt_curr(rq, p, wake_flags); |
1292 | trace_sched_wakeup(p, true); | ||
1293 | 1293 | ||
1294 | p->state = TASK_RUNNING; | 1294 | p->state = TASK_RUNNING; |
1295 | #ifdef CONFIG_SMP | 1295 | #ifdef CONFIG_SMP |
@@ -3039,11 +3039,13 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3039 | asmlinkage void __sched preempt_schedule_irq(void) | 3039 | asmlinkage void __sched preempt_schedule_irq(void) |
3040 | { | 3040 | { |
3041 | struct thread_info *ti = current_thread_info(); | 3041 | struct thread_info *ti = current_thread_info(); |
3042 | enum ctx_state prev_state; | ||
3042 | 3043 | ||
3043 | /* Catch callers which need to be fixed */ | 3044 | /* Catch callers which need to be fixed */ |
3044 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3045 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3045 | 3046 | ||
3046 | user_exit(); | 3047 | prev_state = exception_enter(); |
3048 | |||
3047 | do { | 3049 | do { |
3048 | add_preempt_count(PREEMPT_ACTIVE); | 3050 | add_preempt_count(PREEMPT_ACTIVE); |
3049 | local_irq_enable(); | 3051 | local_irq_enable(); |
@@ -3057,6 +3059,8 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3057 | */ | 3059 | */ |
3058 | barrier(); | 3060 | barrier(); |
3059 | } while (need_resched()); | 3061 | } while (need_resched()); |
3062 | |||
3063 | exception_exit(prev_state); | ||
3060 | } | 3064 | } |
3061 | 3065 | ||
3062 | #endif /* CONFIG_PREEMPT */ | 3066 | #endif /* CONFIG_PREEMPT */ |
@@ -6204,7 +6208,7 @@ static void sched_init_numa(void) | |||
6204 | * 'level' contains the number of unique distances, excluding the | 6208 | * 'level' contains the number of unique distances, excluding the |
6205 | * identity distance node_distance(i,i). | 6209 | * identity distance node_distance(i,i). |
6206 | * | 6210 | * |
6207 | * The sched_domains_nume_distance[] array includes the actual distance | 6211 | * The sched_domains_numa_distance[] array includes the actual distance |
6208 | * numbers. | 6212 | * numbers. |
6209 | */ | 6213 | */ |
6210 | 6214 | ||
@@ -6817,11 +6821,15 @@ int in_sched_functions(unsigned long addr) | |||
6817 | } | 6821 | } |
6818 | 6822 | ||
6819 | #ifdef CONFIG_CGROUP_SCHED | 6823 | #ifdef CONFIG_CGROUP_SCHED |
6824 | /* | ||
6825 | * Default task group. | ||
6826 | * Every task in system belongs to this group at bootup. | ||
6827 | */ | ||
6820 | struct task_group root_task_group; | 6828 | struct task_group root_task_group; |
6821 | LIST_HEAD(task_groups); | 6829 | LIST_HEAD(task_groups); |
6822 | #endif | 6830 | #endif |
6823 | 6831 | ||
6824 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 6832 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
6825 | 6833 | ||
6826 | void __init sched_init(void) | 6834 | void __init sched_init(void) |
6827 | { | 6835 | { |
@@ -6858,7 +6866,7 @@ void __init sched_init(void) | |||
6858 | #endif /* CONFIG_RT_GROUP_SCHED */ | 6866 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6859 | #ifdef CONFIG_CPUMASK_OFFSTACK | 6867 | #ifdef CONFIG_CPUMASK_OFFSTACK |
6860 | for_each_possible_cpu(i) { | 6868 | for_each_possible_cpu(i) { |
6861 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; | 6869 | per_cpu(load_balance_mask, i) = (void *)ptr; |
6862 | ptr += cpumask_size(); | 6870 | ptr += cpumask_size(); |
6863 | } | 6871 | } |
6864 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6872 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
@@ -6884,12 +6892,6 @@ void __init sched_init(void) | |||
6884 | 6892 | ||
6885 | #endif /* CONFIG_CGROUP_SCHED */ | 6893 | #endif /* CONFIG_CGROUP_SCHED */ |
6886 | 6894 | ||
6887 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6888 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6889 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6890 | /* Too early, not expected to fail */ | ||
6891 | BUG_ON(!root_cpuacct.cpuusage); | ||
6892 | #endif | ||
6893 | for_each_possible_cpu(i) { | 6895 | for_each_possible_cpu(i) { |
6894 | struct rq *rq; | 6896 | struct rq *rq; |
6895 | 6897 | ||
@@ -7411,7 +7413,7 @@ unlock: | |||
7411 | return err; | 7413 | return err; |
7412 | } | 7414 | } |
7413 | 7415 | ||
7414 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 7416 | static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) |
7415 | { | 7417 | { |
7416 | u64 rt_runtime, rt_period; | 7418 | u64 rt_runtime, rt_period; |
7417 | 7419 | ||
@@ -7423,7 +7425,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7423 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7425 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
7424 | } | 7426 | } |
7425 | 7427 | ||
7426 | long sched_group_rt_runtime(struct task_group *tg) | 7428 | static long sched_group_rt_runtime(struct task_group *tg) |
7427 | { | 7429 | { |
7428 | u64 rt_runtime_us; | 7430 | u64 rt_runtime_us; |
7429 | 7431 | ||
@@ -7435,7 +7437,7 @@ long sched_group_rt_runtime(struct task_group *tg) | |||
7435 | return rt_runtime_us; | 7437 | return rt_runtime_us; |
7436 | } | 7438 | } |
7437 | 7439 | ||
7438 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | 7440 | static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) |
7439 | { | 7441 | { |
7440 | u64 rt_runtime, rt_period; | 7442 | u64 rt_runtime, rt_period; |
7441 | 7443 | ||
@@ -7448,7 +7450,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
7448 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7450 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
7449 | } | 7451 | } |
7450 | 7452 | ||
7451 | long sched_group_rt_period(struct task_group *tg) | 7453 | static long sched_group_rt_period(struct task_group *tg) |
7452 | { | 7454 | { |
7453 | u64 rt_period_us; | 7455 | u64 rt_period_us; |
7454 | 7456 | ||
@@ -7483,7 +7485,7 @@ static int sched_rt_global_constraints(void) | |||
7483 | return ret; | 7485 | return ret; |
7484 | } | 7486 | } |
7485 | 7487 | ||
7486 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | 7488 | static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) |
7487 | { | 7489 | { |
7488 | /* Don't accept realtime tasks when there is no way for them to run */ | 7490 | /* Don't accept realtime tasks when there is no way for them to run */ |
7489 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) | 7491 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) |
@@ -7991,226 +7993,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7991 | 7993 | ||
7992 | #endif /* CONFIG_CGROUP_SCHED */ | 7994 | #endif /* CONFIG_CGROUP_SCHED */ |
7993 | 7995 | ||
7994 | #ifdef CONFIG_CGROUP_CPUACCT | ||
7995 | |||
7996 | /* | ||
7997 | * CPU accounting code for task groups. | ||
7998 | * | ||
7999 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
8000 | * (balbir@in.ibm.com). | ||
8001 | */ | ||
8002 | |||
8003 | struct cpuacct root_cpuacct; | ||
8004 | |||
8005 | /* create a new cpu accounting group */ | ||
8006 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
8007 | { | ||
8008 | struct cpuacct *ca; | ||
8009 | |||
8010 | if (!cgrp->parent) | ||
8011 | return &root_cpuacct.css; | ||
8012 | |||
8013 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
8014 | if (!ca) | ||
8015 | goto out; | ||
8016 | |||
8017 | ca->cpuusage = alloc_percpu(u64); | ||
8018 | if (!ca->cpuusage) | ||
8019 | goto out_free_ca; | ||
8020 | |||
8021 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
8022 | if (!ca->cpustat) | ||
8023 | goto out_free_cpuusage; | ||
8024 | |||
8025 | return &ca->css; | ||
8026 | |||
8027 | out_free_cpuusage: | ||
8028 | free_percpu(ca->cpuusage); | ||
8029 | out_free_ca: | ||
8030 | kfree(ca); | ||
8031 | out: | ||
8032 | return ERR_PTR(-ENOMEM); | ||
8033 | } | ||
8034 | |||
8035 | /* destroy an existing cpu accounting group */ | ||
8036 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
8037 | { | ||
8038 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8039 | |||
8040 | free_percpu(ca->cpustat); | ||
8041 | free_percpu(ca->cpuusage); | ||
8042 | kfree(ca); | ||
8043 | } | ||
8044 | |||
8045 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
8046 | { | ||
8047 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8048 | u64 data; | ||
8049 | |||
8050 | #ifndef CONFIG_64BIT | ||
8051 | /* | ||
8052 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
8053 | */ | ||
8054 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8055 | data = *cpuusage; | ||
8056 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8057 | #else | ||
8058 | data = *cpuusage; | ||
8059 | #endif | ||
8060 | |||
8061 | return data; | ||
8062 | } | ||
8063 | |||
8064 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
8065 | { | ||
8066 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8067 | |||
8068 | #ifndef CONFIG_64BIT | ||
8069 | /* | ||
8070 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
8071 | */ | ||
8072 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8073 | *cpuusage = val; | ||
8074 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8075 | #else | ||
8076 | *cpuusage = val; | ||
8077 | #endif | ||
8078 | } | ||
8079 | |||
8080 | /* return total cpu usage (in nanoseconds) of a group */ | ||
8081 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
8082 | { | ||
8083 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8084 | u64 totalcpuusage = 0; | ||
8085 | int i; | ||
8086 | |||
8087 | for_each_present_cpu(i) | ||
8088 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
8089 | |||
8090 | return totalcpuusage; | ||
8091 | } | ||
8092 | |||
8093 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
8094 | u64 reset) | ||
8095 | { | ||
8096 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8097 | int err = 0; | ||
8098 | int i; | ||
8099 | |||
8100 | if (reset) { | ||
8101 | err = -EINVAL; | ||
8102 | goto out; | ||
8103 | } | ||
8104 | |||
8105 | for_each_present_cpu(i) | ||
8106 | cpuacct_cpuusage_write(ca, i, 0); | ||
8107 | |||
8108 | out: | ||
8109 | return err; | ||
8110 | } | ||
8111 | |||
8112 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
8113 | struct seq_file *m) | ||
8114 | { | ||
8115 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
8116 | u64 percpu; | ||
8117 | int i; | ||
8118 | |||
8119 | for_each_present_cpu(i) { | ||
8120 | percpu = cpuacct_cpuusage_read(ca, i); | ||
8121 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
8122 | } | ||
8123 | seq_printf(m, "\n"); | ||
8124 | return 0; | ||
8125 | } | ||
8126 | |||
8127 | static const char *cpuacct_stat_desc[] = { | ||
8128 | [CPUACCT_STAT_USER] = "user", | ||
8129 | [CPUACCT_STAT_SYSTEM] = "system", | ||
8130 | }; | ||
8131 | |||
8132 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
8133 | struct cgroup_map_cb *cb) | ||
8134 | { | ||
8135 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8136 | int cpu; | ||
8137 | s64 val = 0; | ||
8138 | |||
8139 | for_each_online_cpu(cpu) { | ||
8140 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8141 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8142 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8143 | } | ||
8144 | val = cputime64_to_clock_t(val); | ||
8145 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
8146 | |||
8147 | val = 0; | ||
8148 | for_each_online_cpu(cpu) { | ||
8149 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8150 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
8151 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8152 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
8153 | } | ||
8154 | |||
8155 | val = cputime64_to_clock_t(val); | ||
8156 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8157 | |||
8158 | return 0; | ||
8159 | } | ||
8160 | |||
8161 | static struct cftype files[] = { | ||
8162 | { | ||
8163 | .name = "usage", | ||
8164 | .read_u64 = cpuusage_read, | ||
8165 | .write_u64 = cpuusage_write, | ||
8166 | }, | ||
8167 | { | ||
8168 | .name = "usage_percpu", | ||
8169 | .read_seq_string = cpuacct_percpu_seq_read, | ||
8170 | }, | ||
8171 | { | ||
8172 | .name = "stat", | ||
8173 | .read_map = cpuacct_stats_show, | ||
8174 | }, | ||
8175 | { } /* terminate */ | ||
8176 | }; | ||
8177 | |||
8178 | /* | ||
8179 | * charge this task's execution time to its accounting group. | ||
8180 | * | ||
8181 | * called with rq->lock held. | ||
8182 | */ | ||
8183 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
8184 | { | ||
8185 | struct cpuacct *ca; | ||
8186 | int cpu; | ||
8187 | |||
8188 | if (unlikely(!cpuacct_subsys.active)) | ||
8189 | return; | ||
8190 | |||
8191 | cpu = task_cpu(tsk); | ||
8192 | |||
8193 | rcu_read_lock(); | ||
8194 | |||
8195 | ca = task_ca(tsk); | ||
8196 | |||
8197 | for (; ca; ca = parent_ca(ca)) { | ||
8198 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8199 | *cpuusage += cputime; | ||
8200 | } | ||
8201 | |||
8202 | rcu_read_unlock(); | ||
8203 | } | ||
8204 | |||
8205 | struct cgroup_subsys cpuacct_subsys = { | ||
8206 | .name = "cpuacct", | ||
8207 | .css_alloc = cpuacct_css_alloc, | ||
8208 | .css_free = cpuacct_css_free, | ||
8209 | .subsys_id = cpuacct_subsys_id, | ||
8210 | .base_cftypes = files, | ||
8211 | }; | ||
8212 | #endif /* CONFIG_CGROUP_CPUACCT */ | ||
8213 | |||
8214 | void dump_cpu_task(int cpu) | 7996 | void dump_cpu_task(int cpu) |
8215 | { | 7997 | { |
8216 | pr_info("Task dump for CPU %d:\n", cpu); | 7998 | pr_info("Task dump for CPU %d:\n", cpu); |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..dbb7e2cd95eb --- /dev/null +++ b/kernel/sched/cpuacct.c | |||
@@ -0,0 +1,296 @@ | |||
1 | #include <linux/cgroup.h> | ||
2 | #include <linux/slab.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/cpumask.h> | ||
6 | #include <linux/seq_file.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/kernel_stat.h> | ||
9 | #include <linux/err.h> | ||
10 | |||
11 | #include "sched.h" | ||
12 | |||
13 | /* | ||
14 | * CPU accounting code for task groups. | ||
15 | * | ||
16 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
17 | * (balbir@in.ibm.com). | ||
18 | */ | ||
19 | |||
20 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
21 | enum cpuacct_stat_index { | ||
22 | CPUACCT_STAT_USER, /* ... user mode */ | ||
23 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
24 | |||
25 | CPUACCT_STAT_NSTATS, | ||
26 | }; | ||
27 | |||
28 | /* track cpu usage of a group of tasks and its child groups */ | ||
29 | struct cpuacct { | ||
30 | struct cgroup_subsys_state css; | ||
31 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
32 | u64 __percpu *cpuusage; | ||
33 | struct kernel_cpustat __percpu *cpustat; | ||
34 | }; | ||
35 | |||
36 | /* return cpu accounting group corresponding to this container */ | ||
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | ||
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
40 | struct cpuacct, css); | ||
41 | } | ||
42 | |||
43 | /* return cpu accounting group to which this task belongs */ | ||
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
45 | { | ||
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | ||
54 | |||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
56 | { | ||
57 | if (!ca->css.cgroup->parent) | ||
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | ||
61 | |||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | ||
63 | static struct cpuacct root_cpuacct = { | ||
64 | .cpustat = &kernel_cpustat, | ||
65 | .cpuusage = &root_cpuacct_cpuusage, | ||
66 | }; | ||
67 | |||
68 | /* create a new cpu accounting group */ | ||
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
70 | { | ||
71 | struct cpuacct *ca; | ||
72 | |||
73 | if (!cgrp->parent) | ||
74 | return &root_cpuacct.css; | ||
75 | |||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
77 | if (!ca) | ||
78 | goto out; | ||
79 | |||
80 | ca->cpuusage = alloc_percpu(u64); | ||
81 | if (!ca->cpuusage) | ||
82 | goto out_free_ca; | ||
83 | |||
84 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
85 | if (!ca->cpustat) | ||
86 | goto out_free_cpuusage; | ||
87 | |||
88 | return &ca->css; | ||
89 | |||
90 | out_free_cpuusage: | ||
91 | free_percpu(ca->cpuusage); | ||
92 | out_free_ca: | ||
93 | kfree(ca); | ||
94 | out: | ||
95 | return ERR_PTR(-ENOMEM); | ||
96 | } | ||
97 | |||
98 | /* destroy an existing cpu accounting group */ | ||
99 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
100 | { | ||
101 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
102 | |||
103 | free_percpu(ca->cpustat); | ||
104 | free_percpu(ca->cpuusage); | ||
105 | kfree(ca); | ||
106 | } | ||
107 | |||
108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
109 | { | ||
110 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
111 | u64 data; | ||
112 | |||
113 | #ifndef CONFIG_64BIT | ||
114 | /* | ||
115 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
116 | */ | ||
117 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
118 | data = *cpuusage; | ||
119 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
120 | #else | ||
121 | data = *cpuusage; | ||
122 | #endif | ||
123 | |||
124 | return data; | ||
125 | } | ||
126 | |||
127 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
128 | { | ||
129 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
130 | |||
131 | #ifndef CONFIG_64BIT | ||
132 | /* | ||
133 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
134 | */ | ||
135 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
136 | *cpuusage = val; | ||
137 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
138 | #else | ||
139 | *cpuusage = val; | ||
140 | #endif | ||
141 | } | ||
142 | |||
143 | /* return total cpu usage (in nanoseconds) of a group */ | ||
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
145 | { | ||
146 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
147 | u64 totalcpuusage = 0; | ||
148 | int i; | ||
149 | |||
150 | for_each_present_cpu(i) | ||
151 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
152 | |||
153 | return totalcpuusage; | ||
154 | } | ||
155 | |||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
157 | u64 reset) | ||
158 | { | ||
159 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
160 | int err = 0; | ||
161 | int i; | ||
162 | |||
163 | if (reset) { | ||
164 | err = -EINVAL; | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | for_each_present_cpu(i) | ||
169 | cpuacct_cpuusage_write(ca, i, 0); | ||
170 | |||
171 | out: | ||
172 | return err; | ||
173 | } | ||
174 | |||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
176 | struct seq_file *m) | ||
177 | { | ||
178 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
179 | u64 percpu; | ||
180 | int i; | ||
181 | |||
182 | for_each_present_cpu(i) { | ||
183 | percpu = cpuacct_cpuusage_read(ca, i); | ||
184 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
185 | } | ||
186 | seq_printf(m, "\n"); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static const char * const cpuacct_stat_desc[] = { | ||
191 | [CPUACCT_STAT_USER] = "user", | ||
192 | [CPUACCT_STAT_SYSTEM] = "system", | ||
193 | }; | ||
194 | |||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
196 | struct cgroup_map_cb *cb) | ||
197 | { | ||
198 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
199 | int cpu; | ||
200 | s64 val = 0; | ||
201 | |||
202 | for_each_online_cpu(cpu) { | ||
203 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
204 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
205 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
206 | } | ||
207 | val = cputime64_to_clock_t(val); | ||
208 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
209 | |||
210 | val = 0; | ||
211 | for_each_online_cpu(cpu) { | ||
212 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
213 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
214 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
215 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
216 | } | ||
217 | |||
218 | val = cputime64_to_clock_t(val); | ||
219 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
220 | |||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static struct cftype files[] = { | ||
225 | { | ||
226 | .name = "usage", | ||
227 | .read_u64 = cpuusage_read, | ||
228 | .write_u64 = cpuusage_write, | ||
229 | }, | ||
230 | { | ||
231 | .name = "usage_percpu", | ||
232 | .read_seq_string = cpuacct_percpu_seq_read, | ||
233 | }, | ||
234 | { | ||
235 | .name = "stat", | ||
236 | .read_map = cpuacct_stats_show, | ||
237 | }, | ||
238 | { } /* terminate */ | ||
239 | }; | ||
240 | |||
241 | /* | ||
242 | * charge this task's execution time to its accounting group. | ||
243 | * | ||
244 | * called with rq->lock held. | ||
245 | */ | ||
246 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
247 | { | ||
248 | struct cpuacct *ca; | ||
249 | int cpu; | ||
250 | |||
251 | cpu = task_cpu(tsk); | ||
252 | |||
253 | rcu_read_lock(); | ||
254 | |||
255 | ca = task_ca(tsk); | ||
256 | |||
257 | while (true) { | ||
258 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
259 | *cpuusage += cputime; | ||
260 | |||
261 | ca = parent_ca(ca); | ||
262 | if (!ca) | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | rcu_read_unlock(); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Add user/system time to cpuacct. | ||
271 | * | ||
272 | * Note: it's the caller that updates the account of the root cgroup. | ||
273 | */ | ||
274 | void cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
275 | { | ||
276 | struct kernel_cpustat *kcpustat; | ||
277 | struct cpuacct *ca; | ||
278 | |||
279 | rcu_read_lock(); | ||
280 | ca = task_ca(p); | ||
281 | while (ca != &root_cpuacct) { | ||
282 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
283 | kcpustat->cpustat[index] += val; | ||
284 | ca = __parent_ca(ca); | ||
285 | } | ||
286 | rcu_read_unlock(); | ||
287 | } | ||
288 | |||
289 | struct cgroup_subsys cpuacct_subsys = { | ||
290 | .name = "cpuacct", | ||
291 | .css_alloc = cpuacct_css_alloc, | ||
292 | .css_free = cpuacct_css_free, | ||
293 | .subsys_id = cpuacct_subsys_id, | ||
294 | .base_cftypes = files, | ||
295 | .early_init = 1, | ||
296 | }; | ||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..ed605624a5e7 --- /dev/null +++ b/kernel/sched/cpuacct.h | |||
@@ -0,0 +1,17 @@ | |||
1 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2 | |||
3 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
4 | extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); | ||
5 | |||
6 | #else | ||
7 | |||
8 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
9 | { | ||
10 | } | ||
11 | |||
12 | static inline void | ||
13 | cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | #endif | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e93cca92f38b..ea32f02bf2c3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) | |||
115 | static inline void task_group_account_field(struct task_struct *p, int index, | 115 | static inline void task_group_account_field(struct task_struct *p, int index, |
116 | u64 tmp) | 116 | u64 tmp) |
117 | { | 117 | { |
118 | #ifdef CONFIG_CGROUP_CPUACCT | ||
119 | struct kernel_cpustat *kcpustat; | ||
120 | struct cpuacct *ca; | ||
121 | #endif | ||
122 | /* | 118 | /* |
123 | * Since all updates are sure to touch the root cgroup, we | 119 | * Since all updates are sure to touch the root cgroup, we |
124 | * get ourselves ahead and touch it first. If the root cgroup | 120 | * get ourselves ahead and touch it first. If the root cgroup |
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
127 | */ | 123 | */ |
128 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; |
129 | 125 | ||
130 | #ifdef CONFIG_CGROUP_CPUACCT | 126 | cpuacct_account_field(p, index, tmp); |
131 | if (unlikely(!cpuacct_subsys.active)) | ||
132 | return; | ||
133 | |||
134 | rcu_read_lock(); | ||
135 | ca = task_ca(p); | ||
136 | while (ca && (ca != &root_cpuacct)) { | ||
137 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
138 | kcpustat->cpustat[index] += tmp; | ||
139 | ca = parent_ca(ca); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | #endif | ||
143 | } | 127 | } |
144 | 128 | ||
145 | /* | 129 | /* |
@@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
388 | struct rq *rq) {} | 372 | struct rq *rq) {} |
389 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 373 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
390 | 374 | ||
391 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
392 | /* | ||
393 | * Account a single tick of cpu time. | ||
394 | * @p: the process that the cpu time gets accounted to | ||
395 | * @user_tick: indicates if the tick is a user or a system tick | ||
396 | */ | ||
397 | void account_process_tick(struct task_struct *p, int user_tick) | ||
398 | { | ||
399 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
400 | struct rq *rq = this_rq(); | ||
401 | |||
402 | if (vtime_accounting_enabled()) | ||
403 | return; | ||
404 | |||
405 | if (sched_clock_irqtime) { | ||
406 | irqtime_account_process_tick(p, user_tick, rq); | ||
407 | return; | ||
408 | } | ||
409 | |||
410 | if (steal_account_process_tick()) | ||
411 | return; | ||
412 | |||
413 | if (user_tick) | ||
414 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
415 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
416 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
417 | one_jiffy_scaled); | ||
418 | else | ||
419 | account_idle_time(cputime_one_jiffy); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * Account multiple ticks of steal time. | ||
424 | * @p: the process from which the cpu time has been stolen | ||
425 | * @ticks: number of stolen ticks | ||
426 | */ | ||
427 | void account_steal_ticks(unsigned long ticks) | ||
428 | { | ||
429 | account_steal_time(jiffies_to_cputime(ticks)); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Account multiple ticks of idle time. | ||
434 | * @ticks: number of stolen ticks | ||
435 | */ | ||
436 | void account_idle_ticks(unsigned long ticks) | ||
437 | { | ||
438 | |||
439 | if (sched_clock_irqtime) { | ||
440 | irqtime_account_idle_ticks(ticks); | ||
441 | return; | ||
442 | } | ||
443 | |||
444 | account_idle_time(jiffies_to_cputime(ticks)); | ||
445 | } | ||
446 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | ||
447 | |||
448 | /* | 375 | /* |
449 | * Use precise platform statistics if available: | 376 | * Use precise platform statistics if available: |
450 | */ | 377 | */ |
451 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
452 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
453 | { | ||
454 | *ut = p->utime; | ||
455 | *st = p->stime; | ||
456 | } | ||
457 | |||
458 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
459 | { | ||
460 | struct task_cputime cputime; | ||
461 | |||
462 | thread_group_cputime(p, &cputime); | ||
463 | |||
464 | *ut = cputime.utime; | ||
465 | *st = cputime.stime; | ||
466 | } | ||
467 | 379 | ||
468 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
469 | void vtime_task_switch(struct task_struct *prev) | 381 | void vtime_task_switch(struct task_struct *prev) |
@@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
518 | } | 430 | } |
519 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
520 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
434 | |||
435 | |||
436 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
437 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
438 | { | ||
439 | *ut = p->utime; | ||
440 | *st = p->stime; | ||
441 | } | ||
521 | 442 | ||
522 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 443 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
444 | { | ||
445 | struct task_cputime cputime; | ||
523 | 446 | ||
524 | static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) | 447 | thread_group_cputime(p, &cputime); |
448 | |||
449 | *ut = cputime.utime; | ||
450 | *st = cputime.stime; | ||
451 | } | ||
452 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | ||
453 | /* | ||
454 | * Account a single tick of cpu time. | ||
455 | * @p: the process that the cpu time gets accounted to | ||
456 | * @user_tick: indicates if the tick is a user or a system tick | ||
457 | */ | ||
458 | void account_process_tick(struct task_struct *p, int user_tick) | ||
525 | { | 459 | { |
526 | u64 temp = (__force u64) rtime; | 460 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
461 | struct rq *rq = this_rq(); | ||
527 | 462 | ||
528 | temp *= (__force u64) stime; | 463 | if (vtime_accounting_enabled()) |
464 | return; | ||
465 | |||
466 | if (sched_clock_irqtime) { | ||
467 | irqtime_account_process_tick(p, user_tick, rq); | ||
468 | return; | ||
469 | } | ||
470 | |||
471 | if (steal_account_process_tick()) | ||
472 | return; | ||
529 | 473 | ||
530 | if (sizeof(cputime_t) == 4) | 474 | if (user_tick) |
531 | temp = div_u64(temp, (__force u32) total); | 475 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
476 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
477 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
478 | one_jiffy_scaled); | ||
532 | else | 479 | else |
533 | temp = div64_u64(temp, (__force u64) total); | 480 | account_idle_time(cputime_one_jiffy); |
481 | } | ||
534 | 482 | ||
535 | return (__force cputime_t) temp; | 483 | /* |
484 | * Account multiple ticks of steal time. | ||
485 | * @p: the process from which the cpu time has been stolen | ||
486 | * @ticks: number of stolen ticks | ||
487 | */ | ||
488 | void account_steal_ticks(unsigned long ticks) | ||
489 | { | ||
490 | account_steal_time(jiffies_to_cputime(ticks)); | ||
491 | } | ||
492 | |||
493 | /* | ||
494 | * Account multiple ticks of idle time. | ||
495 | * @ticks: number of idle ticks | ||
496 | */ | ||
497 | void account_idle_ticks(unsigned long ticks) | ||
498 | { | ||
499 | |||
500 | if (sched_clock_irqtime) { | ||
501 | irqtime_account_idle_ticks(ticks); | ||
502 | return; | ||
503 | } | ||
504 | |||
505 | account_idle_time(jiffies_to_cputime(ticks)); | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * Perform (stime * rtime) / total with reduced chances | ||
510 | * of multiplication overflows by using smaller factors | ||
511 | * like quotient and remainders of divisions between | ||
512 | * rtime and total. | ||
513 | */ | ||
514 | static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) | ||
515 | { | ||
516 | u64 rem, res, scaled; | ||
517 | |||
518 | if (rtime >= total) { | ||
519 | /* | ||
520 | * Scale up to rtime / total then add | ||
521 | * the remainder scaled to stime / total. | ||
522 | */ | ||
523 | res = div64_u64_rem(rtime, total, &rem); | ||
524 | scaled = stime * res; | ||
525 | scaled += div64_u64(stime * rem, total); | ||
526 | } else { | ||
527 | /* | ||
528 | * Same in reverse: scale down to total / rtime | ||
529 | * then subtract that result scaled | ||
530 | * to the remaining part. | ||
531 | */ | ||
532 | res = div64_u64_rem(total, rtime, &rem); | ||
533 | scaled = div64_u64(stime, res); | ||
534 | scaled -= div64_u64(scaled * rem, total); | ||
535 | } | ||
536 | |||
537 | return (__force cputime_t) scaled; | ||
536 | } | 538 | } |
537 | 539 | ||
538 | /* | 540 | /* |
@@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr, | |||
545 | { | 547 | { |
546 | cputime_t rtime, stime, total; | 548 | cputime_t rtime, stime, total; |
547 | 549 | ||
550 | if (vtime_accounting_enabled()) { | ||
551 | *ut = curr->utime; | ||
552 | *st = curr->stime; | ||
553 | return; | ||
554 | } | ||
555 | |||
548 | stime = curr->stime; | 556 | stime = curr->stime; |
549 | total = stime + curr->utime; | 557 | total = stime + curr->utime; |
550 | 558 | ||
@@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr, | |||
560 | */ | 568 | */ |
561 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 569 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
562 | 570 | ||
563 | if (total) | 571 | if (!rtime) { |
564 | stime = scale_stime(stime, rtime, total); | 572 | stime = 0; |
565 | else | 573 | } else if (!total) { |
566 | stime = rtime; | 574 | stime = rtime; |
575 | } else { | ||
576 | stime = scale_stime((__force u64)stime, | ||
577 | (__force u64)rtime, (__force u64)total); | ||
578 | } | ||
567 | 579 | ||
568 | /* | 580 | /* |
569 | * If the tick based count grows faster than the scheduler one, | 581 | * If the tick based count grows faster than the scheduler one, |
@@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
597 | thread_group_cputime(p, &cputime); | 609 | thread_group_cputime(p, &cputime); |
598 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | 610 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); |
599 | } | 611 | } |
600 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 612 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
601 | 613 | ||
602 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 614 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
603 | static unsigned long long vtime_delta(struct task_struct *tsk) | 615 | static unsigned long long vtime_delta(struct task_struct *tsk) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e5986fc5..8bf7081b1ec5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); | |||
431 | * Scheduling class tree data structure manipulation methods: | 431 | * Scheduling class tree data structure manipulation methods: |
432 | */ | 432 | */ |
433 | 433 | ||
434 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | 434 | static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) |
435 | { | 435 | { |
436 | s64 delta = (s64)(vruntime - min_vruntime); | 436 | s64 delta = (s64)(vruntime - max_vruntime); |
437 | if (delta > 0) | 437 | if (delta > 0) |
438 | min_vruntime = vruntime; | 438 | max_vruntime = vruntime; |
439 | 439 | ||
440 | return min_vruntime; | 440 | return max_vruntime; |
441 | } | 441 | } |
442 | 442 | ||
443 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | 443 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) |
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
473 | vruntime = min_vruntime(vruntime, se->vruntime); | 473 | vruntime = min_vruntime(vruntime, se->vruntime); |
474 | } | 474 | } |
475 | 475 | ||
476 | /* ensure we never gain time by being placed backwards. */ | ||
476 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 477 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
477 | #ifndef CONFIG_64BIT | 478 | #ifndef CONFIG_64BIT |
478 | smp_wmb(); | 479 | smp_wmb(); |
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
652 | } | 653 | } |
653 | 654 | ||
654 | /* | 655 | /* |
655 | * We calculate the vruntime slice of a to be inserted task | 656 | * We calculate the vruntime slice of a to-be-inserted task. |
656 | * | 657 | * |
657 | * vs = s/w | 658 | * vs = s/w |
658 | */ | 659 | */ |
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1562 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
1563 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ |
1564 | } | 1565 | } |
1566 | |||
1567 | /* | ||
1568 | * Update the rq's load with the elapsed running time before entering | ||
1569 | * idle. If the last scheduled task is not a CFS task, idle_enter will | ||
1570 | * be the only way to update the runnable statistic. | ||
1571 | */ | ||
1572 | void idle_enter_fair(struct rq *this_rq) | ||
1573 | { | ||
1574 | update_rq_runnable_avg(this_rq, 1); | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * Update the rq's load with the elapsed idle time before a task is | ||
1579 | * scheduled. If the newly scheduled task is not a CFS task, idle_exit will | ||
1580 | * be the only way to update the runnable statistic. | ||
1581 | */ | ||
1582 | void idle_exit_fair(struct rq *this_rq) | ||
1583 | { | ||
1584 | update_rq_runnable_avg(this_rq, 0); | ||
1585 | } | ||
1586 | |||
1565 | #else | 1587 | #else |
1566 | static inline void update_entity_load_avg(struct sched_entity *se, | 1588 | static inline void update_entity_load_avg(struct sched_entity *se, |
1567 | int update_cfs_rq) {} | 1589 | int update_cfs_rq) {} |
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3874 | int tsk_cache_hot = 0; | 3896 | int tsk_cache_hot = 0; |
3875 | /* | 3897 | /* |
3876 | * We do not migrate tasks that are: | 3898 | * We do not migrate tasks that are: |
3877 | * 1) running (obviously), or | 3899 | * 1) throttled_lb_pair, or |
3878 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3900 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3879 | * 3) are cache-hot on their current CPU. | 3901 | * 3) running (obviously), or |
3902 | * 4) are cache-hot on their current CPU. | ||
3880 | */ | 3903 | */ |
3904 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | ||
3905 | return 0; | ||
3906 | |||
3881 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 3907 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3882 | int new_dst_cpu; | 3908 | int cpu; |
3883 | 3909 | ||
3884 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3910 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3885 | 3911 | ||
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3894 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 3920 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) |
3895 | return 0; | 3921 | return 0; |
3896 | 3922 | ||
3897 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | 3923 | /* Prevent re-selecting dst_cpu via env's cpus */ |
3898 | tsk_cpus_allowed(p)); | 3924 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
3899 | if (new_dst_cpu < nr_cpu_ids) { | 3925 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
3900 | env->flags |= LBF_SOME_PINNED; | 3926 | env->flags |= LBF_SOME_PINNED; |
3901 | env->new_dst_cpu = new_dst_cpu; | 3927 | env->new_dst_cpu = cpu; |
3928 | break; | ||
3929 | } | ||
3902 | } | 3930 | } |
3931 | |||
3903 | return 0; | 3932 | return 0; |
3904 | } | 3933 | } |
3905 | 3934 | ||
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3920 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3921 | if (!tsk_cache_hot || | 3950 | if (!tsk_cache_hot || |
3922 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3923 | #ifdef CONFIG_SCHEDSTATS | 3952 | |
3924 | if (tsk_cache_hot) { | 3953 | if (tsk_cache_hot) { |
3925 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 3954 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3926 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3955 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3927 | } | 3956 | } |
3928 | #endif | 3957 | |
3929 | return 1; | 3958 | return 1; |
3930 | } | 3959 | } |
3931 | 3960 | ||
3932 | if (tsk_cache_hot) { | 3961 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); |
3933 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 3962 | return 0; |
3934 | return 0; | ||
3935 | } | ||
3936 | return 1; | ||
3937 | } | 3963 | } |
3938 | 3964 | ||
3939 | /* | 3965 | /* |
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env) | |||
3948 | struct task_struct *p, *n; | 3974 | struct task_struct *p, *n; |
3949 | 3975 | ||
3950 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 3976 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3951 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) | ||
3952 | continue; | ||
3953 | |||
3954 | if (!can_migrate_task(p, env)) | 3977 | if (!can_migrate_task(p, env)) |
3955 | continue; | 3978 | continue; |
3956 | 3979 | ||
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env) | |||
4002 | break; | 4025 | break; |
4003 | } | 4026 | } |
4004 | 4027 | ||
4005 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 4028 | if (!can_migrate_task(p, env)) |
4006 | goto next; | 4029 | goto next; |
4007 | 4030 | ||
4008 | load = task_h_load(p); | 4031 | load = task_h_load(p); |
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env) | |||
4013 | if ((load / 2) > env->imbalance) | 4036 | if ((load / 2) > env->imbalance) |
4014 | goto next; | 4037 | goto next; |
4015 | 4038 | ||
4016 | if (!can_migrate_task(p, env)) | ||
4017 | goto next; | ||
4018 | |||
4019 | move_task(p, env); | 4039 | move_task(p, env); |
4020 | pulled++; | 4040 | pulled++; |
4021 | env->imbalance -= load; | 4041 | env->imbalance -= load; |
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
4245 | return load_idx; | 4265 | return load_idx; |
4246 | } | 4266 | } |
4247 | 4267 | ||
4248 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 4268 | static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
4249 | { | 4269 | { |
4250 | return SCHED_POWER_SCALE; | 4270 | return SCHED_POWER_SCALE; |
4251 | } | 4271 | } |
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | |||
4255 | return default_scale_freq_power(sd, cpu); | 4275 | return default_scale_freq_power(sd, cpu); |
4256 | } | 4276 | } |
4257 | 4277 | ||
4258 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 4278 | static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) |
4259 | { | 4279 | { |
4260 | unsigned long weight = sd->span_weight; | 4280 | unsigned long weight = sd->span_weight; |
4261 | unsigned long smt_gain = sd->smt_gain; | 4281 | unsigned long smt_gain = sd->smt_gain; |
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
4270 | return default_scale_smt_power(sd, cpu); | 4290 | return default_scale_smt_power(sd, cpu); |
4271 | } | 4291 | } |
4272 | 4292 | ||
4273 | unsigned long scale_rt_power(int cpu) | 4293 | static unsigned long scale_rt_power(int cpu) |
4274 | { | 4294 | { |
4275 | struct rq *rq = cpu_rq(cpu); | 4295 | struct rq *rq = cpu_rq(cpu); |
4276 | u64 total, available, age_stamp, avg; | 4296 | u64 total, available, age_stamp, avg; |
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4960 | #define MAX_PINNED_INTERVAL 512 | 4980 | #define MAX_PINNED_INTERVAL 512 |
4961 | 4981 | ||
4962 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4982 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4963 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4983 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); |
4964 | 4984 | ||
4965 | static int need_active_balance(struct lb_env *env) | 4985 | static int need_active_balance(struct lb_env *env) |
4966 | { | 4986 | { |
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4991 | int *balance) | 5011 | int *balance) |
4992 | { | 5012 | { |
4993 | int ld_moved, cur_ld_moved, active_balance = 0; | 5013 | int ld_moved, cur_ld_moved, active_balance = 0; |
4994 | int lb_iterations, max_lb_iterations; | ||
4995 | struct sched_group *group; | 5014 | struct sched_group *group; |
4996 | struct rq *busiest; | 5015 | struct rq *busiest; |
4997 | unsigned long flags; | 5016 | unsigned long flags; |
4998 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 5017 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); |
4999 | 5018 | ||
5000 | struct lb_env env = { | 5019 | struct lb_env env = { |
5001 | .sd = sd, | 5020 | .sd = sd, |
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5007 | .cpus = cpus, | 5026 | .cpus = cpus, |
5008 | }; | 5027 | }; |
5009 | 5028 | ||
5029 | /* | ||
5030 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
5031 | * other cpus in our group | ||
5032 | */ | ||
5033 | if (idle == CPU_NEWLY_IDLE) | ||
5034 | env.dst_grpmask = NULL; | ||
5035 | |||
5010 | cpumask_copy(cpus, cpu_active_mask); | 5036 | cpumask_copy(cpus, cpu_active_mask); |
5011 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | ||
5012 | 5037 | ||
5013 | schedstat_inc(sd, lb_count[idle]); | 5038 | schedstat_inc(sd, lb_count[idle]); |
5014 | 5039 | ||
@@ -5034,7 +5059,6 @@ redo: | |||
5034 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 5059 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
5035 | 5060 | ||
5036 | ld_moved = 0; | 5061 | ld_moved = 0; |
5037 | lb_iterations = 1; | ||
5038 | if (busiest->nr_running > 1) { | 5062 | if (busiest->nr_running > 1) { |
5039 | /* | 5063 | /* |
5040 | * Attempt to move tasks. If find_busiest_group has found | 5064 | * Attempt to move tasks. If find_busiest_group has found |
@@ -5061,17 +5085,17 @@ more_balance: | |||
5061 | double_rq_unlock(env.dst_rq, busiest); | 5085 | double_rq_unlock(env.dst_rq, busiest); |
5062 | local_irq_restore(flags); | 5086 | local_irq_restore(flags); |
5063 | 5087 | ||
5064 | if (env.flags & LBF_NEED_BREAK) { | ||
5065 | env.flags &= ~LBF_NEED_BREAK; | ||
5066 | goto more_balance; | ||
5067 | } | ||
5068 | |||
5069 | /* | 5088 | /* |
5070 | * some other cpu did the load balance for us. | 5089 | * some other cpu did the load balance for us. |
5071 | */ | 5090 | */ |
5072 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 5091 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) |
5073 | resched_cpu(env.dst_cpu); | 5092 | resched_cpu(env.dst_cpu); |
5074 | 5093 | ||
5094 | if (env.flags & LBF_NEED_BREAK) { | ||
5095 | env.flags &= ~LBF_NEED_BREAK; | ||
5096 | goto more_balance; | ||
5097 | } | ||
5098 | |||
5075 | /* | 5099 | /* |
5076 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 5100 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
5077 | * us and move them to an alternate dst_cpu in our sched_group | 5101 | * us and move them to an alternate dst_cpu in our sched_group |
@@ -5091,14 +5115,17 @@ more_balance: | |||
5091 | * moreover subsequent load balance cycles should correct the | 5115 | * moreover subsequent load balance cycles should correct the |
5092 | * excess load moved. | 5116 | * excess load moved. |
5093 | */ | 5117 | */ |
5094 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 5118 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
5095 | lb_iterations++ < max_lb_iterations) { | ||
5096 | 5119 | ||
5097 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 5120 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5098 | env.dst_cpu = env.new_dst_cpu; | 5121 | env.dst_cpu = env.new_dst_cpu; |
5099 | env.flags &= ~LBF_SOME_PINNED; | 5122 | env.flags &= ~LBF_SOME_PINNED; |
5100 | env.loop = 0; | 5123 | env.loop = 0; |
5101 | env.loop_break = sched_nr_migrate_break; | 5124 | env.loop_break = sched_nr_migrate_break; |
5125 | |||
5126 | /* Prevent re-selecting dst_cpu via env's cpus */ | ||
5127 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5128 | |||
5102 | /* | 5129 | /* |
5103 | * Go back to "more_balance" rather than "redo" since we | 5130 | * Go back to "more_balance" rather than "redo" since we |
5104 | * need to continue with same src_cpu. | 5131 | * need to continue with same src_cpu. |
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5219 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5220 | return; | 5247 | return; |
5221 | 5248 | ||
5222 | update_rq_runnable_avg(this_rq, 1); | ||
5223 | |||
5224 | /* | 5249 | /* |
5225 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5250 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
5226 | */ | 5251 | */ |
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) | |||
5395 | struct sched_domain *sd; | 5420 | struct sched_domain *sd; |
5396 | int cpu = smp_processor_id(); | 5421 | int cpu = smp_processor_id(); |
5397 | 5422 | ||
5398 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5399 | return; | ||
5400 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5401 | |||
5402 | rcu_read_lock(); | 5423 | rcu_read_lock(); |
5403 | for_each_domain(cpu, sd) | 5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5425 | |||
5426 | if (!sd || !sd->nohz_idle) | ||
5427 | goto unlock; | ||
5428 | sd->nohz_idle = 0; | ||
5429 | |||
5430 | for (; sd; sd = sd->parent) | ||
5404 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 5431 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
5432 | unlock: | ||
5405 | rcu_read_unlock(); | 5433 | rcu_read_unlock(); |
5406 | } | 5434 | } |
5407 | 5435 | ||
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void) | |||
5410 | struct sched_domain *sd; | 5438 | struct sched_domain *sd; |
5411 | int cpu = smp_processor_id(); | 5439 | int cpu = smp_processor_id(); |
5412 | 5440 | ||
5413 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5414 | return; | ||
5415 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5416 | |||
5417 | rcu_read_lock(); | 5441 | rcu_read_lock(); |
5418 | for_each_domain(cpu, sd) | 5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5443 | |||
5444 | if (!sd || sd->nohz_idle) | ||
5445 | goto unlock; | ||
5446 | sd->nohz_idle = 1; | ||
5447 | |||
5448 | for (; sd; sd = sd->parent) | ||
5419 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 5449 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
5450 | unlock: | ||
5420 | rcu_read_unlock(); | 5451 | rcu_read_unlock(); |
5421 | } | 5452 | } |
5422 | 5453 | ||
@@ -5468,7 +5499,7 @@ void update_max_interval(void) | |||
5468 | * It checks each scheduling domain to see if it is due to be balanced, | 5499 | * It checks each scheduling domain to see if it is due to be balanced, |
5469 | * and initiates a balancing operation if so. | 5500 | * and initiates a balancing operation if so. |
5470 | * | 5501 | * |
5471 | * Balancing parameters are set up in arch_init_sched_domains. | 5502 | * Balancing parameters are set up in init_sched_domains. |
5472 | */ | 5503 | */ |
5473 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5504 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5474 | { | 5505 | { |
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5506 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5537 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5507 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5538 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
5508 | /* | 5539 | /* |
5509 | * We've pulled tasks over so either we're no | 5540 | * The LBF_SOME_PINNED logic could have changed |
5510 | * longer idle. | 5541 | * env->dst_cpu, so we can't know our idle |
5542 | * state even if we migrated tasks. Update it. | ||
5511 | */ | 5543 | */ |
5512 | idle = CPU_NOT_IDLE; | 5544 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
5513 | } | 5545 | } |
5514 | sd->last_balance = jiffies; | 5546 | sd->last_balance = jiffies; |
5515 | } | 5547 | } |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..b8ce77328341 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | |||
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
15 | } | 15 | } |
16 | |||
17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
18 | { | ||
19 | idle_exit_fair(rq); | ||
20 | } | ||
21 | |||
22 | static void post_schedule_idle(struct rq *rq) | ||
23 | { | ||
24 | idle_enter_fair(rq); | ||
25 | } | ||
16 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
17 | /* | 27 | /* |
18 | * Idle tasks are unconditionally rescheduled: | 28 | * Idle tasks are unconditionally rescheduled: |
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 35 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 36 | { |
27 | schedstat_inc(rq, sched_goidle); | 37 | schedstat_inc(rq, sched_goidle); |
38 | #ifdef CONFIG_SMP | ||
39 | /* Trigger the post schedule to do an idle_enter for CFS */ | ||
40 | rq->post_schedule = 1; | ||
41 | #endif | ||
28 | return rq->idle; | 42 | return rq->idle; |
29 | } | 43 | } |
30 | 44 | ||
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { | |||
86 | 100 | ||
87 | #ifdef CONFIG_SMP | 101 | #ifdef CONFIG_SMP |
88 | .select_task_rq = select_task_rq_idle, | 102 | .select_task_rq = select_task_rq_idle, |
103 | .pre_schedule = pre_schedule_idle, | ||
104 | .post_schedule = post_schedule_idle, | ||
89 | #endif | 105 | #endif |
90 | 106 | ||
91 | .set_curr_task = set_curr_task_idle, | 107 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfdf469f..4c225c4c7111 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
8 | 8 | ||
9 | #include "cpupri.h" | 9 | #include "cpupri.h" |
10 | #include "cpuacct.h" | ||
10 | 11 | ||
11 | extern __read_mostly int scheduler_running; | 12 | extern __read_mostly int scheduler_running; |
12 | 13 | ||
@@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running; | |||
33 | */ | 34 | */ |
34 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 35 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
35 | 36 | ||
37 | /* | ||
38 | * Increase resolution of nice-level calculations for 64-bit architectures. | ||
39 | * The extra resolution improves shares distribution and load balancing of | ||
40 | * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup | ||
41 | * hierarchies, especially on larger systems. This is not a user-visible change | ||
42 | * and does not change the user-interface for setting shares/weights. | ||
43 | * | ||
44 | * We increase resolution only if we have enough bits to allow this increased | ||
45 | * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution | ||
46 | * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the | ||
47 | * increased costs. | ||
48 | */ | ||
49 | #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ | ||
50 | # define SCHED_LOAD_RESOLUTION 10 | ||
51 | # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) | ||
52 | # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) | ||
53 | #else | ||
54 | # define SCHED_LOAD_RESOLUTION 0 | ||
55 | # define scale_load(w) (w) | ||
56 | # define scale_load_down(w) (w) | ||
57 | #endif | ||
58 | |||
59 | #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) | ||
60 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) | ||
61 | |||
36 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 62 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
37 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 63 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
38 | 64 | ||
@@ -154,11 +180,6 @@ struct task_group { | |||
154 | #define MAX_SHARES (1UL << 18) | 180 | #define MAX_SHARES (1UL << 18) |
155 | #endif | 181 | #endif |
156 | 182 | ||
157 | /* Default task group. | ||
158 | * Every task in system belong to this group at bootup. | ||
159 | */ | ||
160 | extern struct task_group root_task_group; | ||
161 | |||
162 | typedef int (*tg_visitor)(struct task_group *, void *); | 183 | typedef int (*tg_visitor)(struct task_group *, void *); |
163 | 184 | ||
164 | extern int walk_tg_tree_from(struct task_group *from, | 185 | extern int walk_tg_tree_from(struct task_group *from, |
@@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
196 | struct sched_rt_entity *rt_se, int cpu, | 217 | struct sched_rt_entity *rt_se, int cpu, |
197 | struct sched_rt_entity *parent); | 218 | struct sched_rt_entity *parent); |
198 | 219 | ||
220 | extern struct task_group *sched_create_group(struct task_group *parent); | ||
221 | extern void sched_online_group(struct task_group *tg, | ||
222 | struct task_group *parent); | ||
223 | extern void sched_destroy_group(struct task_group *tg); | ||
224 | extern void sched_offline_group(struct task_group *tg); | ||
225 | |||
226 | extern void sched_move_task(struct task_struct *tsk); | ||
227 | |||
228 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
229 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
230 | #endif | ||
231 | |||
199 | #else /* CONFIG_CGROUP_SCHED */ | 232 | #else /* CONFIG_CGROUP_SCHED */ |
200 | 233 | ||
201 | struct cfs_bandwidth { }; | 234 | struct cfs_bandwidth { }; |
@@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
547 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 580 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
548 | DECLARE_PER_CPU(int, sd_llc_id); | 581 | DECLARE_PER_CPU(int, sd_llc_id); |
549 | 582 | ||
583 | struct sched_group_power { | ||
584 | atomic_t ref; | ||
585 | /* | ||
586 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | ||
587 | * single CPU. | ||
588 | */ | ||
589 | unsigned int power, power_orig; | ||
590 | unsigned long next_update; | ||
591 | /* | ||
592 | * Number of busy cpus in this group. | ||
593 | */ | ||
594 | atomic_t nr_busy_cpus; | ||
595 | |||
596 | unsigned long cpumask[0]; /* iteration mask */ | ||
597 | }; | ||
598 | |||
599 | struct sched_group { | ||
600 | struct sched_group *next; /* Must be a circular list */ | ||
601 | atomic_t ref; | ||
602 | |||
603 | unsigned int group_weight; | ||
604 | struct sched_group_power *sgp; | ||
605 | |||
606 | /* | ||
607 | * The CPUs this group covers. | ||
608 | * | ||
609 | * NOTE: this field is variable length. (Allocated dynamically | ||
610 | * by attaching extra space to the end of the structure, | ||
611 | * depending on how many CPUs the kernel has booted up with) | ||
612 | */ | ||
613 | unsigned long cpumask[0]; | ||
614 | }; | ||
615 | |||
616 | static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | ||
617 | { | ||
618 | return to_cpumask(sg->cpumask); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * cpumask masking which cpus in the group are allowed to iterate up the domain | ||
623 | * tree. | ||
624 | */ | ||
625 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | ||
626 | { | ||
627 | return to_cpumask(sg->sgp->cpumask); | ||
628 | } | ||
629 | |||
630 | /** | ||
631 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
632 | * @group: The group whose first cpu is to be returned. | ||
633 | */ | ||
634 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
635 | { | ||
636 | return cpumask_first(sched_group_cpus(group)); | ||
637 | } | ||
638 | |||
550 | extern int group_balance_cpu(struct sched_group *sg); | 639 | extern int group_balance_cpu(struct sched_group *sg); |
551 | 640 | ||
552 | #endif /* CONFIG_SMP */ | 641 | #endif /* CONFIG_SMP */ |
@@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
784 | } | 873 | } |
785 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 874 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
786 | 875 | ||
876 | /* | ||
877 | * wake flags | ||
878 | */ | ||
879 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | ||
880 | #define WF_FORK 0x02 /* child wakeup after fork */ | ||
881 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | ||
787 | 882 | ||
788 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 883 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
789 | { | 884 | { |
@@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = { | |||
856 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 951 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
857 | }; | 952 | }; |
858 | 953 | ||
859 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 954 | #define ENQUEUE_WAKEUP 1 |
860 | enum cpuacct_stat_index { | 955 | #define ENQUEUE_HEAD 2 |
861 | CPUACCT_STAT_USER, /* ... user mode */ | 956 | #ifdef CONFIG_SMP |
862 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 957 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ |
958 | #else | ||
959 | #define ENQUEUE_WAKING 0 | ||
960 | #endif | ||
863 | 961 | ||
864 | CPUACCT_STAT_NSTATS, | 962 | #define DEQUEUE_SLEEP 1 |
865 | }; | 963 | |
964 | struct sched_class { | ||
965 | const struct sched_class *next; | ||
966 | |||
967 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | ||
968 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | ||
969 | void (*yield_task) (struct rq *rq); | ||
970 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | ||
971 | |||
972 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | ||
973 | |||
974 | struct task_struct * (*pick_next_task) (struct rq *rq); | ||
975 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | ||
976 | |||
977 | #ifdef CONFIG_SMP | ||
978 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | ||
979 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | ||
980 | |||
981 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | ||
982 | void (*post_schedule) (struct rq *this_rq); | ||
983 | void (*task_waking) (struct task_struct *task); | ||
984 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | ||
985 | |||
986 | void (*set_cpus_allowed)(struct task_struct *p, | ||
987 | const struct cpumask *newmask); | ||
866 | 988 | ||
989 | void (*rq_online)(struct rq *rq); | ||
990 | void (*rq_offline)(struct rq *rq); | ||
991 | #endif | ||
992 | |||
993 | void (*set_curr_task) (struct rq *rq); | ||
994 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | ||
995 | void (*task_fork) (struct task_struct *p); | ||
996 | |||
997 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | ||
998 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | ||
999 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | ||
1000 | int oldprio); | ||
1001 | |||
1002 | unsigned int (*get_rr_interval) (struct rq *rq, | ||
1003 | struct task_struct *task); | ||
1004 | |||
1005 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1006 | void (*task_move_group) (struct task_struct *p, int on_rq); | ||
1007 | #endif | ||
1008 | }; | ||
867 | 1009 | ||
868 | #define sched_class_highest (&stop_sched_class) | 1010 | #define sched_class_highest (&stop_sched_class) |
869 | #define for_each_class(class) \ | 1011 | #define for_each_class(class) \ |
@@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class; | |||
877 | 1019 | ||
878 | #ifdef CONFIG_SMP | 1020 | #ifdef CONFIG_SMP |
879 | 1021 | ||
1022 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
1023 | |||
880 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1024 | extern void trigger_load_balance(struct rq *rq, int cpu); |
881 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1025 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
882 | 1026 | ||
1027 | /* | ||
1028 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1029 | * becomes useful in lb | ||
1030 | */ | ||
1031 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1032 | extern void idle_enter_fair(struct rq *this_rq); | ||
1033 | extern void idle_exit_fair(struct rq *this_rq); | ||
1034 | #else | ||
1035 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1036 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1037 | #endif | ||
1038 | |||
883 | #else /* CONFIG_SMP */ | 1039 | #else /* CONFIG_SMP */ |
884 | 1040 | ||
885 | static inline void idle_balance(int cpu, struct rq *rq) | 1041 | static inline void idle_balance(int cpu, struct rq *rq) |
@@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
891 | extern void sysrq_sched_debug_show(void); | 1047 | extern void sysrq_sched_debug_show(void); |
892 | extern void sched_init_granularity(void); | 1048 | extern void sched_init_granularity(void); |
893 | extern void update_max_interval(void); | 1049 | extern void update_max_interval(void); |
894 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
895 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | 1050 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); |
896 | extern void init_sched_rt_class(void); | 1051 | extern void init_sched_rt_class(void); |
897 | extern void init_sched_fair_class(void); | 1052 | extern void init_sched_fair_class(void); |
@@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
904 | 1059 | ||
905 | extern void update_idle_cpu_load(struct rq *this_rq); | 1060 | extern void update_idle_cpu_load(struct rq *this_rq); |
906 | 1061 | ||
907 | #ifdef CONFIG_CGROUP_CPUACCT | ||
908 | #include <linux/cgroup.h> | ||
909 | /* track cpu usage of a group of tasks and its child groups */ | ||
910 | struct cpuacct { | ||
911 | struct cgroup_subsys_state css; | ||
912 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
913 | u64 __percpu *cpuusage; | ||
914 | struct kernel_cpustat __percpu *cpustat; | ||
915 | }; | ||
916 | |||
917 | extern struct cgroup_subsys cpuacct_subsys; | ||
918 | extern struct cpuacct root_cpuacct; | ||
919 | |||
920 | /* return cpu accounting group corresponding to this container */ | ||
921 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
922 | { | ||
923 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
924 | struct cpuacct, css); | ||
925 | } | ||
926 | |||
927 | /* return cpu accounting group to which this task belongs */ | ||
928 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
929 | { | ||
930 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
931 | struct cpuacct, css); | ||
932 | } | ||
933 | |||
934 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
935 | { | ||
936 | if (!ca || !ca->css.cgroup->parent) | ||
937 | return NULL; | ||
938 | return cgroup_ca(ca->css.cgroup->parent); | ||
939 | } | ||
940 | |||
941 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
942 | #else | ||
943 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
944 | #endif | ||
945 | |||
946 | #ifdef CONFIG_PARAVIRT | 1062 | #ifdef CONFIG_PARAVIRT |
947 | static inline u64 steal_ticks(u64 steal) | 1063 | static inline u64 steal_ticks(u64 steal) |
948 | { | 1064 | { |
@@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | |||
1187 | enum rq_nohz_flag_bits { | 1303 | enum rq_nohz_flag_bits { |
1188 | NOHZ_TICK_STOPPED, | 1304 | NOHZ_TICK_STOPPED, |
1189 | NOHZ_BALANCE_KICK, | 1305 | NOHZ_BALANCE_KICK, |
1190 | NOHZ_IDLE, | ||
1191 | }; | 1306 | }; |
1192 | 1307 | ||
1193 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1308 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |