author     Paul Turner <pjt@google.com>    2011-07-21 12:43:28 -0400
committer  Ingo Molnar <mingo@elte.hu>     2011-08-14 06:03:20 -0400
commit     ab84d31e15502fb626169ba2663381e34bf965b2
tree       658ce7caa6199aa74c5feea92ec8d3e9a2cb4296
parent     953bfcd10e6f3697233e8e5128c611d275da39c1
sched: Introduce primitives to account for CFS bandwidth tracking
In this patch we introduce the notion of CFS bandwidth, partitioned into
globally unassigned bandwidth and locally claimed bandwidth:
- The global bandwidth is per task_group; it represents a pool of unclaimed
  bandwidth that cfs_rqs can allocate from.
- The local bandwidth is tracked per cfs_rq; it represents allotments of
  bandwidth from the global pool that have been assigned to a specific cpu.
Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu
subsystem (an illustrative usage sketch follows below):
- cpu.cfs_period_us : the bandwidth period in usecs
- cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg is allowed
  to consume over the period above.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
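
An illustrative usage sketch of the two new knobs (not part of the patch
itself; the cgroup mount point, group name and helper function below are
assumptions made purely for illustration):

	/* Cap a hypothetical group at roughly half a CPU by granting
	 * 50ms of quota per 100ms period; error handling kept minimal.
	 */
	#include <stdio.h>

	static int write_val(const char *path, long val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%ld\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* hypothetical path; depends on where the cpu controller
		 * is mounted on a given system */
		const char *grp = "/sys/fs/cgroup/cpu/mygroup";
		char path[256];

		snprintf(path, sizeof(path), "%s/cpu.cfs_period_us", grp);
		write_val(path, 100000);	/* 100ms period */

		snprintf(path, sizeof(path), "%s/cpu.cfs_quota_us", grp);
		write_val(path, 50000);		/* 50ms quota per period */

		return 0;
	}

Writing -1 to cpu.cfs_quota_us (the default, RUNTIME_INF internally) leaves
the group unconstrained.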
---
 init/Kconfig        |  12
 kernel/sched.c      | 196
 kernel/sched_fair.c |  16
 3 files changed, 220 insertions(+), 4 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index d62778390e55..d19b3a77ab44 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler. Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
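For reference, a .config fragment that would enable the new option (a sketch;
CGROUP_SCHED itself has prerequisites, such as basic cgroup support, that are
not shown in this hunk):

	CONFIG_EXPERIMENTAL=y
	CONFIG_CGROUP_SCHED=y
	CONFIG_FAIR_GROUP_SCHED=y
	CONFIG_CFS_BANDWIDTH=y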
diff --git a/kernel/sched.c b/kernel/sched.c
index cd1a531ca8ff..f08cb23be96c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -247,6 +247,14 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -278,6 +286,8 @@ struct task_group
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -377,9 +387,48 @@ struct cfs_rq
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	s64 runtime_remaining;
+#endif
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* allow initial update_cfs_load() to truncate */
 	cfs_rq->load_stamp = 1;
 #endif
+	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
@@ -8110,6 +8160,7 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	tg->shares = NICE_0_LOAD;
 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
@@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
@@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_runtime(struct task_group *tg)
@@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	if (rt_period == 0)
 		return -EINVAL;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_period(struct task_group *tg)
@@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) scale_load_down(tg->shares);
 }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	static DEFINE_MUTEX(mutex);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at some amount of bandwidth every period. This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the otherside by preventing insane quota
+	 * periods. This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_remaining = 0;
+		raw_spin_unlock_irq(&rq->lock);
+	}
+	mutex_unlock(&mutex);
+
+	return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = {
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
-
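To make the accepted ranges concrete, a small user-space-style sketch (not
from the patch; the helper name is invented) that mirrors the unit conversion
and bounds checks performed by tg_set_cfs_quota()/tg_set_cfs_period()/
tg_set_cfs_bandwidth() above:

	#include <stdint.h>

	#define NSEC_PER_USEC	1000ULL
	#define NSEC_PER_MSEC	1000000ULL
	#define NSEC_PER_SEC	1000000000ULL

	/* Quota and period arrive in microseconds and are scaled to
	 * nanoseconds; the period must lie in [1ms, 1s] and each period
	 * must carry at least 1ms of quota.  A negative quota_us means
	 * "unlimited" (RUNTIME_INF above) and trivially satisfies the
	 * quota bound.
	 */
	static int cfs_args_acceptable(long quota_us, long period_us)
	{
		uint64_t quota, period;

		if (period_us <= 0)
			return 0;

		period = (uint64_t)period_us * NSEC_PER_USEC;
		quota = quota_us < 0 ? UINT64_MAX :
				       (uint64_t)quota_us * NSEC_PER_USEC;

		if (quota < NSEC_PER_MSEC || period < NSEC_PER_MSEC)
			return 0;
		if (period > NSEC_PER_SEC)
			return 0;
		return 1;
	}

	/* cfs_args_acceptable(50000, 100000)  -> 1  (50ms per 100ms)
	 * cfs_args_acceptable(-1, 100000)     -> 1  (unlimited)
	 * cfs_args_acceptable(500, 100000)    -> 0  (quota below 1ms)
	 * cfs_args_acceptable(50000, 2000000) -> 0  (period above 1s)
	 */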
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f86b0cb5eb29..f24f4171019d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1234,6 +1234,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	check_preempt_tick(cfs_rq, curr);
 }
 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
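
A closing worked example (illustration only, not from the patch): with the
100ms default period above, the quota is a group-wide run-time budget per
period, so the effective CPU limit is simply quota divided by period.

	/* Hypothetical helper: the CPU limit implied by a quota/period
	 * pair, expressed in full CPUs.
	 */
	static inline double cfs_effective_cpus(unsigned long long quota_us,
						unsigned long long period_us)
	{
		return (double)quota_us / (double)period_us;
	}

	/* quota =  50000us, period = 100000us -> 0.5 of one CPU
	 * quota = 250000us, period = 100000us -> 2.5 CPUs worth of runtime,
	 *         drawn from the global pool by cfs_rqs on different cpus
	 */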