author     Paul Turner <pjt@google.com>    2011-07-21 12:43:28 -0400
committer  Ingo Molnar <mingo@elte.hu>     2011-08-14 06:03:20 -0400
commit     ab84d31e15502fb626169ba2663381e34bf965b2 (patch)
tree       658ce7caa6199aa74c5feea92ec8d3e9a2cb4296
parent     953bfcd10e6f3697233e8e5128c611d275da39c1 (diff)
sched: Introduce primitives to account for CFS bandwidth tracking
In this patch we introduce the notion of CFS bandwidth, partitioned into
globally unassigned bandwidth, and locally claimed bandwidth.

- The global bandwidth is per task_group, it represents a pool of
  unclaimed bandwidth that cfs_rqs can allocate from.
- The local bandwidth is tracked per-cfs_rq, this represents allotments from
  the global pool bandwidth assigned to a specific cpu.

Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu
subsystem:
- cpu.cfs_period_us : the bandwidth period in usecs
- cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
  to consume over period above.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  init/Kconfig        |  12
-rw-r--r--  kernel/sched.c      | 196
-rw-r--r--  kernel/sched_fair.c |  16
3 files changed, 220 insertions(+), 4 deletions(-)
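
As a usage illustration (not part of this patch): the two new cpu subsystem files described in the commit message can be driven from user space by writing microsecond values into the group's cgroup directory. A minimal sketch, assuming the cpu cgroup hierarchy is mounted at /dev/cgroup/cpu and a group named "limited" already exists (both the mount point and the group name are placeholders):

/*
 * Sketch only: cap the group "limited" to 25ms of CPU time per 100ms
 * period, i.e. roughly a quarter of one CPU.  Paths are assumptions;
 * the cpu cgroup mount point varies by setup.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_val(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	const char *grp = "/dev/cgroup/cpu/limited";	/* assumed mount point/group */
	char path[256];

	snprintf(path, sizeof(path), "%s/cpu.cfs_period_us", grp);
	if (write_val(path, 100000))		/* 100ms bandwidth period */
		return EXIT_FAILURE;

	snprintf(path, sizeof(path), "%s/cpu.cfs_quota_us", grp);
	if (write_val(path, 25000))		/* 25ms of CPU per period */
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}

Writing -1 to cpu.cfs_quota_us returns the group to the unconstrained state.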
diff --git a/init/Kconfig b/init/Kconfig
index d62778390e55..d19b3a77ab44 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler.  Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
diff --git a/kernel/sched.c b/kernel/sched.c
index cd1a531ca8ff..f08cb23be96c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -247,6 +247,14 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -278,6 +286,8 @@ struct task_group {
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -377,9 +387,48 @@ struct cfs_rq {
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	s64 runtime_remaining;
+#endif
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* allow initial update_cfs_load() to truncate */
 	cfs_rq->load_stamp = 1;
 #endif
+	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
@@ -8110,6 +8160,7 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	tg->shares = NICE_0_LOAD;
 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
@@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
@@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_runtime(struct task_group *tg)
@@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	if (rt_period == 0)
 		return -EINVAL;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_period(struct task_group *tg)
@@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) scale_load_down(tg->shares);
 }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	static DEFINE_MUTEX(mutex);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at some amount of bandwidth every period.  This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the otherside by preventing insane quota
+	 * periods.  This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_remaining = 0;
+		raw_spin_unlock_irq(&rq->lock);
+	}
+	mutex_unlock(&mutex);
+
+	return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = {
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
-
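
The interface functions in the kernel/sched.c hunks above fix the units and limits: quota and period are written in microseconds, converted to nanoseconds internally via NSEC_PER_USEC, and a negative quota maps to RUNTIME_INF, i.e. no limit; tg_set_cfs_bandwidth additionally rejects periods outside [1ms, 1s] and finite quotas below 1ms. A user-space validation sketch mirroring those bounds (the helper name and macro names are illustrative, not part of the kernel API):

/*
 * Illustrative check mirroring tg_set_cfs_bandwidth(): the period must lie
 * in [1ms, 1s] and a finite quota must be at least 1ms; a negative quota
 * means unlimited.  Values are in microseconds, as accepted by
 * cpu.cfs_quota_us / cpu.cfs_period_us.
 */
#include <stdbool.h>

#define MIN_CFS_QUOTA_PERIOD_US 1000L		/* 1ms, cf. min_cfs_quota_period */
#define MAX_CFS_QUOTA_PERIOD_US 1000000L	/* 1s, cf. max_cfs_quota_period */

static bool cfs_values_ok(long quota_us, long period_us)
{
	if (period_us < MIN_CFS_QUOTA_PERIOD_US ||
	    period_us > MAX_CFS_QUOTA_PERIOD_US)
		return false;
	if (quota_us >= 0 && quota_us < MIN_CFS_QUOTA_PERIOD_US)
		return false;
	return true;
}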
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f86b0cb5eb29..f24f4171019d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1234,6 +1234,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	check_preempt_tick(cfs_rq, curr);
 }
 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
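
For reference, default_cfs_period() above fixes the default period at 100ms; together with the quota this determines how much CPU time the group may consume per period, and since the quota is a group-wide pool, a quota larger than the period can permit more than one CPU's worth of runtime on SMP (a rough reading; see the sched-bwc.txt document referenced in the Kconfig help for the authoritative description). A small arithmetic sketch:

/*
 * Illustrative arithmetic only: with the default 100ms period, the quota
 * expresses a group-wide CPU allowance per period.
 */
#include <stdio.h>

int main(void)
{
	const double period_us = 100000.0;		/* default_cfs_period(), as usecs */
	const long quotas_us[] = { 25000, 100000, 200000 };
	int i;

	for (i = 0; i < 3; i++)
		/* 25000/100000 -> 0.25, 200000/100000 -> 2.00 CPUs' worth */
		printf("quota %6ld us => %.2f CPUs' worth of runtime per period\n",
		       quotas_us[i], quotas_us[i] / period_us);
	return 0;
}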