diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2008-04-19 13:44:58 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-04-19 13:44:58 -0400 |
commit | ac086bc22997a2be24fc40fc8d46522fe7e03d11 (patch) | |
tree | 7a484ba13acbdf0fa98c896ce58e807b4b5b1af9 | |
parent | d0b27fa77854b149ad4af08b0fe47fe712a47ade (diff) |
sched: rt-group: smp balancing
Currently the rt group scheduling enforces a per-CPU runtime limit; however,
the rt load balancer makes no guarantees about an equal spread of real-time
tasks, only that at any one time the highest-priority tasks run.
Solve this by making the runtime limit a global property: once the local
limit runs out, borrow excess (unused) runtime from the other CPUs.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | kernel/sched.c | 40 | ||||
-rw-r--r-- | kernel/sched_rt.c | 88 |
2 files changed, 122 insertions, 6 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index bb20323f7d09..313cd4f057cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -164,6 +164,7 @@ struct rt_prio_array { | |||
164 | struct rt_bandwidth { | 164 | struct rt_bandwidth { |
165 | ktime_t rt_period; | 165 | ktime_t rt_period; |
166 | u64 rt_runtime; | 166 | u64 rt_runtime; |
167 | spinlock_t rt_runtime_lock; | ||
167 | struct hrtimer rt_period_timer; | 168 | struct hrtimer rt_period_timer; |
168 | }; | 169 | }; |
169 | 170 | ||
@@ -198,6 +199,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
198 | rt_b->rt_period = ns_to_ktime(period); | 199 | rt_b->rt_period = ns_to_ktime(period); |
199 | rt_b->rt_runtime = runtime; | 200 | rt_b->rt_runtime = runtime; |
200 | 201 | ||
202 | spin_lock_init(&rt_b->rt_runtime_lock); | ||
203 | |||
201 | hrtimer_init(&rt_b->rt_period_timer, | 204 | hrtimer_init(&rt_b->rt_period_timer, |
202 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 205 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
203 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 206 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
@@ -414,6 +417,8 @@ struct rt_rq { | |||
414 | #endif | 417 | #endif |
415 | int rt_throttled; | 418 | int rt_throttled; |
416 | u64 rt_time; | 419 | u64 rt_time; |
420 | u64 rt_runtime; | ||
421 | spinlock_t rt_runtime_lock; | ||
417 | 422 | ||
418 | #ifdef CONFIG_RT_GROUP_SCHED | 423 | #ifdef CONFIG_RT_GROUP_SCHED |
419 | unsigned long rt_nr_boosted; | 424 | unsigned long rt_nr_boosted; |
@@ -7299,6 +7304,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7299 | 7304 | ||
7300 | rt_rq->rt_time = 0; | 7305 | rt_rq->rt_time = 0; |
7301 | rt_rq->rt_throttled = 0; | 7306 | rt_rq->rt_throttled = 0; |
7307 | rt_rq->rt_runtime = 0; | ||
7308 | spin_lock_init(&rt_rq->rt_runtime_lock); | ||
7302 | 7309 | ||
7303 | #ifdef CONFIG_RT_GROUP_SCHED | 7310 | #ifdef CONFIG_RT_GROUP_SCHED |
7304 | rt_rq->rt_nr_boosted = 0; | 7311 | rt_rq->rt_nr_boosted = 0; |
@@ -7335,6 +7342,7 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | |||
7335 | init_rt_rq(rt_rq, rq); | 7342 | init_rt_rq(rt_rq, rq); |
7336 | rt_rq->tg = tg; | 7343 | rt_rq->tg = tg; |
7337 | rt_rq->rt_se = rt_se; | 7344 | rt_rq->rt_se = rt_se; |
7345 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
7338 | if (add) | 7346 | if (add) |
7339 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7347 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7340 | 7348 | ||
@@ -7391,6 +7399,8 @@ void __init sched_init(void) | |||
7391 | init_tg_rt_entry(rq, &init_task_group, | 7399 | init_tg_rt_entry(rq, &init_task_group, |
7392 | &per_cpu(init_rt_rq, i), | 7400 | &per_cpu(init_rt_rq, i), |
7393 | &per_cpu(init_sched_rt_entity, i), i, 1); | 7401 | &per_cpu(init_sched_rt_entity, i), i, 1); |
7402 | #else | ||
7403 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
7394 | #endif | 7404 | #endif |
7395 | 7405 | ||
7396 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7406 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -7974,11 +7984,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
7974 | static int tg_set_bandwidth(struct task_group *tg, | 7984 | static int tg_set_bandwidth(struct task_group *tg, |
7975 | u64 rt_period, u64 rt_runtime) | 7985 | u64 rt_period, u64 rt_runtime) |
7976 | { | 7986 | { |
7977 | int err = 0; | 7987 | int i, err = 0; |
7978 | 7988 | ||
7979 | mutex_lock(&rt_constraints_mutex); | 7989 | mutex_lock(&rt_constraints_mutex); |
7980 | read_lock(&tasklist_lock); | 7990 | read_lock(&tasklist_lock); |
7981 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | 7991 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { |
7982 | err = -EBUSY; | 7992 | err = -EBUSY; |
7983 | goto unlock; | 7993 | goto unlock; |
7984 | } | 7994 | } |
@@ -7986,8 +7996,19 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
7986 | err = -EINVAL; | 7996 | err = -EINVAL; |
7987 | goto unlock; | 7997 | goto unlock; |
7988 | } | 7998 | } |
7999 | |||
8000 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
7989 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8001 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
7990 | tg->rt_bandwidth.rt_runtime = rt_runtime; | 8002 | tg->rt_bandwidth.rt_runtime = rt_runtime; |
8003 | |||
8004 | for_each_possible_cpu(i) { | ||
8005 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
8006 | |||
8007 | spin_lock(&rt_rq->rt_runtime_lock); | ||
8008 | rt_rq->rt_runtime = rt_runtime; | ||
8009 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
8010 | } | ||
8011 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
7991 | unlock: | 8012 | unlock: |
7992 | read_unlock(&tasklist_lock); | 8013 | read_unlock(&tasklist_lock); |
7993 | mutex_unlock(&rt_constraints_mutex); | 8014 | mutex_unlock(&rt_constraints_mutex); |
@@ -8052,6 +8073,19 @@ static int sched_rt_global_constraints(void) | |||
8052 | #else | 8073 | #else |
8053 | static int sched_rt_global_constraints(void) | 8074 | static int sched_rt_global_constraints(void) |
8054 | { | 8075 | { |
8076 | unsigned long flags; | ||
8077 | int i; | ||
8078 | |||
8079 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
8080 | for_each_possible_cpu(i) { | ||
8081 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
8082 | |||
8083 | spin_lock(&rt_rq->rt_runtime_lock); | ||
8084 | rt_rq->rt_runtime = global_rt_runtime(); | ||
8085 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
8086 | } | ||
8087 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
8088 | |||
8055 | return 0; | 8089 | return 0; |
8056 | } | 8090 | } |
8057 | #endif | 8091 | #endif |
@@ -8168,7 +8202,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
8168 | #endif | 8202 | #endif |
8169 | 8203 | ||
8170 | #ifdef CONFIG_RT_GROUP_SCHED | 8204 | #ifdef CONFIG_RT_GROUP_SCHED |
8171 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8205 | static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
8172 | struct file *file, | 8206 | struct file *file, |
8173 | const char __user *userbuf, | 8207 | const char __user *userbuf, |
8174 | size_t nbytes, loff_t *unused_ppos) | 8208 | size_t nbytes, loff_t *unused_ppos) |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8bc176136666..6928ded24da1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | |||
62 | if (!rt_rq->tg) | 62 | if (!rt_rq->tg) |
63 | return RUNTIME_INF; | 63 | return RUNTIME_INF; |
64 | 64 | ||
65 | return rt_rq->tg->rt_bandwidth.rt_runtime; | 65 | return rt_rq->rt_runtime; |
66 | } | ||
67 | |||
68 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
69 | { | ||
70 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | ||
66 | } | 71 | } |
67 | 72 | ||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 73 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -145,11 +150,21 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | |||
145 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; | 150 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; |
146 | } | 151 | } |
147 | 152 | ||
153 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
154 | { | ||
155 | return &rt_rq->tg->rt_bandwidth; | ||
156 | } | ||
157 | |||
148 | #else | 158 | #else |
149 | 159 | ||
150 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
151 | { | 161 | { |
152 | return def_rt_bandwidth.rt_runtime; | 162 | return rt_rq->rt_runtime; |
163 | } | ||
164 | |||
165 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
166 | { | ||
167 | return ktime_to_ns(def_rt_bandwidth.rt_period); | ||
153 | } | 168 | } |
154 | 169 | ||
155 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 170 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -200,6 +215,11 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | |||
200 | return &cpu_rq(cpu)->rt; | 215 | return &cpu_rq(cpu)->rt; |
201 | } | 216 | } |
202 | 217 | ||
218 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
219 | { | ||
220 | return &def_rt_bandwidth; | ||
221 | } | ||
222 | |||
203 | #endif | 223 | #endif |
204 | 224 | ||
205 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
@@ -218,8 +238,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
218 | 238 | ||
219 | spin_lock(&rq->lock); | 239 | spin_lock(&rq->lock); |
220 | if (rt_rq->rt_time) { | 240 | if (rt_rq->rt_time) { |
221 | u64 runtime = rt_b->rt_runtime; | 241 | u64 runtime; |
222 | 242 | ||
243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
244 | runtime = rt_rq->rt_runtime; | ||
223 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | 245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); |
224 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
225 | rt_rq->rt_throttled = 0; | 247 | rt_rq->rt_throttled = 0; |
@@ -227,6 +249,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
227 | } | 249 | } |
228 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
229 | idle = 0; | 251 | idle = 0; |
252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
230 | } | 253 | } |
231 | 254 | ||
232 | if (enqueue) | 255 | if (enqueue) |
@@ -237,6 +260,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
237 | return idle; | 260 | return idle; |
238 | } | 261 | } |
239 | 262 | ||
263 | #ifdef CONFIG_SMP | ||
264 | static int balance_runtime(struct rt_rq *rt_rq) | ||
265 | { | ||
266 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
267 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
268 | int i, weight, more = 0; | ||
269 | u64 rt_period; | ||
270 | |||
271 | weight = cpus_weight(rd->span); | ||
272 | |||
273 | spin_lock(&rt_b->rt_runtime_lock); | ||
274 | rt_period = ktime_to_ns(rt_b->rt_period); | ||
275 | for_each_cpu_mask(i, rd->span) { | ||
276 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
277 | s64 diff; | ||
278 | |||
279 | if (iter == rt_rq) | ||
280 | continue; | ||
281 | |||
282 | spin_lock(&iter->rt_runtime_lock); | ||
283 | diff = iter->rt_runtime - iter->rt_time; | ||
284 | if (diff > 0) { | ||
285 | do_div(diff, weight); | ||
286 | if (rt_rq->rt_runtime + diff > rt_period) | ||
287 | diff = rt_period - rt_rq->rt_runtime; | ||
288 | iter->rt_runtime -= diff; | ||
289 | rt_rq->rt_runtime += diff; | ||
290 | more = 1; | ||
291 | if (rt_rq->rt_runtime == rt_period) { | ||
292 | spin_unlock(&iter->rt_runtime_lock); | ||
293 | break; | ||
294 | } | ||
295 | } | ||
296 | spin_unlock(&iter->rt_runtime_lock); | ||
297 | } | ||
298 | spin_unlock(&rt_b->rt_runtime_lock); | ||
299 | |||
300 | return more; | ||
301 | } | ||
302 | #endif | ||
303 | |||
240 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 304 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
241 | { | 305 | { |
242 | #ifdef CONFIG_RT_GROUP_SCHED | 306 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -259,6 +323,22 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
259 | if (rt_rq->rt_throttled) | 323 | if (rt_rq->rt_throttled) |
260 | return rt_rq_throttled(rt_rq); | 324 | return rt_rq_throttled(rt_rq); |
261 | 325 | ||
326 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | ||
327 | return 0; | ||
328 | |||
329 | #ifdef CONFIG_SMP | ||
330 | if (rt_rq->rt_time > runtime) { | ||
331 | int more; | ||
332 | |||
333 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
334 | more = balance_runtime(rt_rq); | ||
335 | spin_lock(&rt_rq->rt_runtime_lock); | ||
336 | |||
337 | if (more) | ||
338 | runtime = sched_rt_runtime(rt_rq); | ||
339 | } | ||
340 | #endif | ||
341 | |||
262 | if (rt_rq->rt_time > runtime) { | 342 | if (rt_rq->rt_time > runtime) { |
263 | rt_rq->rt_throttled = 1; | 343 | rt_rq->rt_throttled = 1; |
264 | if (rt_rq_throttled(rt_rq)) { | 344 | if (rt_rq_throttled(rt_rq)) { |
@@ -294,9 +374,11 @@ static void update_curr_rt(struct rq *rq) | |||
294 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
295 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
296 | 376 | ||
377 | spin_lock(&rt_rq->rt_runtime_lock); | ||
297 | rt_rq->rt_time += delta_exec; | 378 | rt_rq->rt_time += delta_exec; |
298 | if (sched_rt_runtime_exceeded(rt_rq)) | 379 | if (sched_rt_runtime_exceeded(rt_rq)) |
299 | resched_task(curr); | 380 | resched_task(curr); |
381 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
300 | } | 382 | } |
301 | 383 | ||
302 | static inline | 384 | static inline |