author		Linus Torvalds <torvalds@linux-foundation.org>	2008-04-21 18:40:24 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-21 18:40:24 -0400
commit		ec965350bb98bd291eb34f6ecddfdcfc36da1e6e (patch)
tree		983bcaf33ed00b48a86f7f8790cc460cf15dd252 /kernel/sched_fair.c
parent		5f033bb9bc5cb3bb37a79e3ef131f50ecdcb72b0 (diff)
parent		486fdae21458bd9f4e125099bb3c38a4064e450e (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel: (62 commits)
sched: build fix
sched: better rt-group documentation
sched: features fix
sched: /debug/sched_features
sched: add SCHED_FEAT_DEADLINE
sched: debug: show a weight tree
sched: fair: weight calculations
sched: fair-group: de-couple load-balancing from the rb-trees
sched: fair-group scheduling vs latency
sched: rt-group: optimize dequeue_rt_stack
sched: debug: add some debug code to handle the full hierarchy
sched: fair-group: SMP-nice for group scheduling
sched, cpuset: customize sched domains, core
sched, cpuset: customize sched domains, docs
sched: prepatory code movement
sched: rt: multi level group constraints
sched: task_group hierarchy
sched: fix the task_group hierarchy for UID grouping
sched: allow the group scheduler to have multiple levels
sched: mix tasks and groups
...
Diffstat (limited to 'kernel/sched_fair.c')

 -rw-r--r--	kernel/sched_fair.c	580
 1 files changed, 377 insertions, 203 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0080968d3e4a..89fa32b4edf2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1;
 unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
- * SCHED_BATCH wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
-
-/*
  * SCHED_OTHER wake-up granularity.
- * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  * CFS operations on generic schedulable entities:
  */
 
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 /* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 /* An entity is a task if it doesn't "own" a runqueue */
 #define entity_is_task(se)	(!se->my_q)
 
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+		for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+	if (se->cfs_rq == pse->cfs_rq)
+		return 1;
+
+	return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+	return se->parent;
+}
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 
 #define entity_is_task(se)	1
 
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
+#define for_each_sched_entity(se) \
+		for (; se; se = NULL)
 
-static inline struct task_struct *task_of(struct sched_entity *se)
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
-	return container_of(se, struct task_struct, se);
+	return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+	return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+	return NULL;
 }
 
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				se->load.weight, &cfs_rq_of(se)->load);
+	}
+
+	return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
+
+	return delta;
+}
+
+/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_mine(__sched_period(cfs_rq->nr_running),
-			       se->load.weight, &cfs_rq->load);
+	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
- * We calculate the vruntime slice.
+ * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 vslice = __sched_period(nr_running);
+	unsigned long nr_running = cfs_rq->nr_running;
 
-	vslice *= NICE_0_LOAD;
-	do_div(vslice, rq_weight);
+	if (!se->on_rq)
+		nr_running++;
 
-	return vslice;
+	return __sched_period(nr_running);
 }
 
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
 {
-	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
-			cfs_rq->nr_running + 1);
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
+
+	for_each_sched_entity(se) {
+		struct load_weight *se_lw = &se->load;
+
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
+
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
+	}
+
+	return delta;
 }
 
 /*
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
+	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
+	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -510,8 +661,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			if (sched_feat(NORMALIZED_SLEEPER))
+				vruntime -= calc_delta_weight(sysctl_sched_latency, se);
+			else
+				vruntime -= sysctl_sched_latency;
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -627,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	s64 diff, gran;
-
 	if (!cfs_rq->next)
 		return se;
 
-	diff = cfs_rq->next->vruntime - se->vruntime;
-	if (diff < 0)
-		return se;
-
-	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
-	if (diff > gran)
+	if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
 		return se;
 
 	return cfs_rq->next;
@@ -708,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
  * CFS operations on tasks:
  */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
-		for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return grp->my_q;
-}
-
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
-
-	return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-	return se->parent;
-}
-
-#else	/* CONFIG_FAIR_GROUP_SCHED */
-
-#define for_each_sched_entity(se) \
-		for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	struct task_struct *p = task_of(se);
-	struct rq *rq = task_rq(p);
-
-	return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return NULL;
-}
-
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return &cpu_rq(this_cpu)->cfs;
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-	return NULL;
-}
-
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
-
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -916,7 +972,7 @@ static void yield_task_fair(struct rq *rq)
 	/*
 	 * Already in the rightmost position?
 	 */
-	if (unlikely(rightmost->vruntime < se->vruntime))
+	if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
 		return;
 
 	/*
@@ -955,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p)
 		return cpu;
 
 	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_IDLE) {
+		if ((sd->flags & SD_WAKE_IDLE)
+		    || ((sd->flags & SD_WAKE_IDLE_FAR)
+			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
@@ -1099,6 +1157,58 @@ out:
 }
 #endif /* CONFIG_SMP */
 
+static unsigned long wakeup_gran(struct sched_entity *se)
+{
+	unsigned long gran = sysctl_sched_wakeup_granularity;
+
+	/*
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
+	 */
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+
+	return gran;
+}
+
+/*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+	if (vdiff < 0)
+		return -1;
+
+	gran = wakeup_gran(curr);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
 
 /*
  * Preempt the current task with a newly woken task if needed:
@@ -1108,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	unsigned long gran;
+	int se_depth, pse_depth;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1133,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	while (!is_same_group(se, pse)) {
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(se);
+	pse_depth = depth_se(pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
 		se = parent_entity(se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
 		pse = parent_entity(pse);
 	}
 
-	gran = sysctl_sched_wakeup_granularity;
-	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
-	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-	if (pse->vruntime + gran < se->vruntime)
+	if (wakeup_preempt_entity(se, pse) == 1)
 		resched_task(curr);
 }
 
@@ -1197,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
  * the current task:
  */
 static struct task_struct *
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 {
-	struct task_struct *p;
+	struct task_struct *p = NULL;
+	struct sched_entity *se;
+
+	if (next == &cfs_rq->tasks)
+		return NULL;
+
+	/* Skip over entities that are not tasks */
+	do {
+		se = list_entry(next, struct sched_entity, group_node);
+		next = next->next;
+	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (!curr)
+	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	p = rb_entry(curr, struct task_struct, se.run_node);
-	cfs_rq->rb_load_balance_curr = rb_next(curr);
+	cfs_rq->balance_iterator = next;
+
+	if (entity_is_task(se))
+		p = task_of(se);
 
 	return p;
 }
@@ -1214,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg)
 {
 	struct cfs_rq *cfs_rq = arg;
 
-	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
+	return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
 }
 
 static struct task_struct *load_balance_next_fair(void *arg)
 {
 	struct cfs_rq *cfs_rq = arg;
 
-	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		if (imbalance < 0)
+			imbalance = busiest_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-				maxload, sd, idle, all_pinned,
-				this_best_prio,
-				&cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1461,16 +1611,40 @@ static const struct sched_class fair_sched_class = {
 };
 
 #ifdef CONFIG_SCHED_DEBUG
+static void
+print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
+{
+	struct sched_entity *se;
+
+	if (!cfs_rq)
+		return;
+
+	list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
+		int i;
+
+		for (i = depth; i; i--)
+			seq_puts(m, " ");
+
+		seq_printf(m, "%lu %s %lu\n",
+				se->load.weight,
+				entity_is_task(se) ? "T" : "G",
+				calc_delta_weight(SCHED_LOAD_SCALE, se)
+				);
+		if (!entity_is_task(se))
+			print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
+	}
+}
+
 static void print_cfs_stats(struct seq_file *m, int cpu)
 {
 	struct cfs_rq *cfs_rq;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
-#endif
 	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
+
+	seq_printf(m, "\nWeight tree:\n");
+	print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
 	rcu_read_unlock();
 }
 #endif
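
The weight rework in this patch is plain proportional scaling: sched_slice() shrinks the scheduling period by w/rw (entity weight over runqueue weight) via calc_delta_weight(), and __update_curr() advances vruntime by the inverse ratio via calc_delta_fair(), one hierarchy level at a time. The stand-alone sketch below mimics that arithmetic in user space with straight 64-bit division; it is not the kernel's fixed-point calc_delta_mine(), and the weights used are illustrative values (1024 being the conventional nice-0 weight).

```c
/*
 * User-space sketch of the w/rw scaling used in this patch (not kernel code).
 *   slice           = period * w / rw      (cf. calc_delta_weight())
 *   vruntime delta  = slice  * rw / w      (cf. calc_delta_fair())
 * Weights are illustrative; 1024 is the usual nice-0 weight.
 */
#include <stdio.h>
#include <stdint.h>

/* delta *= w / rw : share of the period an entity of weight w receives */
static uint64_t delta_weight(uint64_t delta, unsigned long w, unsigned long rw)
{
	return delta * w / rw;
}

/* delta *= rw / w : how far vruntime moves for that entity's runtime */
static uint64_t delta_fair(uint64_t delta, unsigned long w, unsigned long rw)
{
	return delta * rw / w;
}

int main(void)
{
	const uint64_t period = 20000000;	/* 20 ms period, in nanoseconds */
	const unsigned long weights[] = { 1024, 2048, 512 };
	unsigned long rw = 0;
	size_t i;

	for (i = 0; i < sizeof(weights) / sizeof(weights[0]); i++)
		rw += weights[i];

	for (i = 0; i < sizeof(weights) / sizeof(weights[0]); i++) {
		uint64_t slice = delta_weight(period, weights[i], rw);

		/* every entity's vruntime advances by roughly one full period */
		printf("w=%4lu  slice=%8llu ns  vruntime advance=%8llu ns\n",
		       weights[i],
		       (unsigned long long)slice,
		       (unsigned long long)delta_fair(slice, weights[i], rw));
	}
	return 0;
}
```

Each entity gets wall-clock time in proportion to its weight, yet its vruntime advances by approximately the same amount, the full period, modulo integer truncation, which is what the updated comment in sched_vslice_add() expresses as vs = s*rw/w = p.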