Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 3170
1 file changed, 732 insertions(+), 2438 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index adb5e923cc61..5e3c509e0efe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@ | |||
71 | #include <linux/debugfs.h> | 71 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 72 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | 73 | #include <linux/ftrace.h> |
74 | #include <linux/slab.h> | ||
74 | 75 | ||
75 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
@@ -144,7 +145,7 @@ struct rt_prio_array { | |||
144 | 145 | ||
145 | struct rt_bandwidth { | 146 | struct rt_bandwidth { |
146 | /* nests inside the rq lock: */ | 147 | /* nests inside the rq lock: */ |
147 | spinlock_t rt_runtime_lock; | 148 | raw_spinlock_t rt_runtime_lock; |
148 | ktime_t rt_period; | 149 | ktime_t rt_period; |
149 | u64 rt_runtime; | 150 | u64 rt_runtime; |
150 | struct hrtimer rt_period_timer; | 151 | struct hrtimer rt_period_timer; |
@@ -181,7 +182,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
181 | rt_b->rt_period = ns_to_ktime(period); | 182 | rt_b->rt_period = ns_to_ktime(period); |
182 | rt_b->rt_runtime = runtime; | 183 | rt_b->rt_runtime = runtime; |
183 | 184 | ||
184 | spin_lock_init(&rt_b->rt_runtime_lock); | 185 | raw_spin_lock_init(&rt_b->rt_runtime_lock); |
185 | 186 | ||
186 | hrtimer_init(&rt_b->rt_period_timer, | 187 | hrtimer_init(&rt_b->rt_period_timer, |
187 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 188 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
@@ -203,7 +204,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
203 | if (hrtimer_active(&rt_b->rt_period_timer)) | 204 | if (hrtimer_active(&rt_b->rt_period_timer)) |
204 | return; | 205 | return; |
205 | 206 | ||
206 | spin_lock(&rt_b->rt_runtime_lock); | 207 | raw_spin_lock(&rt_b->rt_runtime_lock); |
207 | for (;;) { | 208 | for (;;) { |
208 | unsigned long delta; | 209 | unsigned long delta; |
209 | ktime_t soft, hard; | 210 | ktime_t soft, hard; |
@@ -220,7 +221,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
220 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | 221 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, |
221 | HRTIMER_MODE_ABS_PINNED, 0); | 222 | HRTIMER_MODE_ABS_PINNED, 0); |
222 | } | 223 | } |
223 | spin_unlock(&rt_b->rt_runtime_lock); | 224 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
224 | } | 225 | } |
225 | 226 | ||
226 | #ifdef CONFIG_RT_GROUP_SCHED | 227 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -236,7 +237,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
236 | */ | 237 | */ |
237 | static DEFINE_MUTEX(sched_domains_mutex); | 238 | static DEFINE_MUTEX(sched_domains_mutex); |
238 | 239 | ||
239 | #ifdef CONFIG_GROUP_SCHED | 240 | #ifdef CONFIG_CGROUP_SCHED |
240 | 241 | ||
241 | #include <linux/cgroup.h> | 242 | #include <linux/cgroup.h> |
242 | 243 | ||
@@ -246,13 +247,7 @@ static LIST_HEAD(task_groups); | |||
246 | 247 | ||
247 | /* task group related information */ | 248 | /* task group related information */ |
248 | struct task_group { | 249 | struct task_group { |
249 | #ifdef CONFIG_CGROUP_SCHED | ||
250 | struct cgroup_subsys_state css; | 250 | struct cgroup_subsys_state css; |
251 | #endif | ||
252 | |||
253 | #ifdef CONFIG_USER_SCHED | ||
254 | uid_t uid; | ||
255 | #endif | ||
256 | 251 | ||
257 | #ifdef CONFIG_FAIR_GROUP_SCHED | 252 | #ifdef CONFIG_FAIR_GROUP_SCHED |
258 | /* schedulable entities of this group on each cpu */ | 253 | /* schedulable entities of this group on each cpu */ |
@@ -277,35 +272,7 @@ struct task_group { | |||
277 | struct list_head children; | 272 | struct list_head children; |
278 | }; | 273 | }; |
279 | 274 | ||
280 | #ifdef CONFIG_USER_SCHED | ||
281 | |||
282 | /* Helper function to pass uid information to create_sched_user() */ | ||
283 | void set_tg_uid(struct user_struct *user) | ||
284 | { | ||
285 | user->tg->uid = user->uid; | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Root task group. | ||
290 | * Every UID task group (including init_task_group aka UID-0) will | ||
291 | * be a child to this group. | ||
292 | */ | ||
293 | struct task_group root_task_group; | ||
294 | |||
295 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
296 | /* Default task group's sched entity on each cpu */ | ||
297 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
298 | /* Default task group's cfs_rq on each cpu */ | ||
299 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); | ||
300 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
301 | |||
302 | #ifdef CONFIG_RT_GROUP_SCHED | ||
303 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
304 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); | ||
305 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
306 | #else /* !CONFIG_USER_SCHED */ | ||
307 | #define root_task_group init_task_group | 275 | #define root_task_group init_task_group |
308 | #endif /* CONFIG_USER_SCHED */ | ||
309 | 276 | ||
310 | /* task_group_lock serializes add/remove of task groups and also changes to | 277 | /* task_group_lock serializes add/remove of task groups and also changes to |
311 | * a task group's cpu shares. | 278 | * a task group's cpu shares. |
@@ -321,11 +288,7 @@ static int root_task_group_empty(void) | |||
321 | } | 288 | } |
322 | #endif | 289 | #endif |
323 | 290 | ||
324 | #ifdef CONFIG_USER_SCHED | ||
325 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | ||
326 | #else /* !CONFIG_USER_SCHED */ | ||
327 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 291 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
328 | #endif /* CONFIG_USER_SCHED */ | ||
329 | 292 | ||
330 | /* | 293 | /* |
331 | * A weight of 0 or 1 can cause arithmetics problems. | 294 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -351,11 +314,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
351 | { | 314 | { |
352 | struct task_group *tg; | 315 | struct task_group *tg; |
353 | 316 | ||
354 | #ifdef CONFIG_USER_SCHED | 317 | #ifdef CONFIG_CGROUP_SCHED |
355 | rcu_read_lock(); | ||
356 | tg = __task_cred(p)->user->tg; | ||
357 | rcu_read_unlock(); | ||
358 | #elif defined(CONFIG_CGROUP_SCHED) | ||
359 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 318 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
360 | struct task_group, css); | 319 | struct task_group, css); |
361 | #else | 320 | #else |
@@ -367,6 +326,15 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
367 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 326 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
368 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | 327 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
369 | { | 328 | { |
329 | /* | ||
330 | * Strictly speaking this rcu_read_lock() is not needed since the | ||
331 | * task_group is tied to the cgroup, which in turn can never go away | ||
332 | * as long as there are tasks attached to it. | ||
333 | * | ||
334 | * However since task_group() uses task_subsys_state() which is an | ||
335 | * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. | ||
336 | */ | ||
337 | rcu_read_lock(); | ||
370 | #ifdef CONFIG_FAIR_GROUP_SCHED | 338 | #ifdef CONFIG_FAIR_GROUP_SCHED |
371 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 339 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
372 | p->se.parent = task_group(p)->se[cpu]; | 340 | p->se.parent = task_group(p)->se[cpu]; |
@@ -376,6 +344,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
376 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | 344 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; |
377 | p->rt.parent = task_group(p)->rt_se[cpu]; | 345 | p->rt.parent = task_group(p)->rt_se[cpu]; |
378 | #endif | 346 | #endif |
347 | rcu_read_unlock(); | ||
379 | } | 348 | } |
380 | 349 | ||
381 | #else | 350 | #else |
@@ -386,7 +355,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
386 | return NULL; | 355 | return NULL; |
387 | } | 356 | } |
388 | 357 | ||
389 | #endif /* CONFIG_GROUP_SCHED */ | 358 | #endif /* CONFIG_CGROUP_SCHED */ |
390 | 359 | ||
391 | /* CFS-related fields in a runqueue */ | 360 | /* CFS-related fields in a runqueue */ |
392 | struct cfs_rq { | 361 | struct cfs_rq { |
@@ -473,7 +442,7 @@ struct rt_rq { | |||
473 | u64 rt_time; | 442 | u64 rt_time; |
474 | u64 rt_runtime; | 443 | u64 rt_runtime; |
475 | /* Nests inside the rq lock: */ | 444 | /* Nests inside the rq lock: */ |
476 | spinlock_t rt_runtime_lock; | 445 | raw_spinlock_t rt_runtime_lock; |
477 | 446 | ||
478 | #ifdef CONFIG_RT_GROUP_SCHED | 447 | #ifdef CONFIG_RT_GROUP_SCHED |
479 | unsigned long rt_nr_boosted; | 448 | unsigned long rt_nr_boosted; |
@@ -481,7 +450,6 @@ struct rt_rq { | |||
481 | struct rq *rq; | 450 | struct rq *rq; |
482 | struct list_head leaf_rt_rq_list; | 451 | struct list_head leaf_rt_rq_list; |
483 | struct task_group *tg; | 452 | struct task_group *tg; |
484 | struct sched_rt_entity *rt_se; | ||
485 | #endif | 453 | #endif |
486 | }; | 454 | }; |
487 | 455 | ||
@@ -534,7 +502,7 @@ static struct root_domain def_root_domain; | |||
534 | */ | 502 | */ |
535 | struct rq { | 503 | struct rq { |
536 | /* runqueue lock: */ | 504 | /* runqueue lock: */ |
537 | spinlock_t lock; | 505 | raw_spinlock_t lock; |
538 | 506 | ||
539 | /* | 507 | /* |
540 | * nr_running and cpu_load should be in the same cacheline because | 508 | * nr_running and cpu_load should be in the same cacheline because |
@@ -544,14 +512,12 @@ struct rq { | |||
544 | #define CPU_LOAD_IDX_MAX 5 | 512 | #define CPU_LOAD_IDX_MAX 5 |
545 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 513 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
546 | #ifdef CONFIG_NO_HZ | 514 | #ifdef CONFIG_NO_HZ |
547 | unsigned long last_tick_seen; | ||
548 | unsigned char in_nohz_recently; | 515 | unsigned char in_nohz_recently; |
549 | #endif | 516 | #endif |
550 | /* capture load from *all* tasks on this cpu: */ | 517 | /* capture load from *all* tasks on this cpu: */ |
551 | struct load_weight load; | 518 | struct load_weight load; |
552 | unsigned long nr_load_updates; | 519 | unsigned long nr_load_updates; |
553 | u64 nr_switches; | 520 | u64 nr_switches; |
554 | u64 nr_migrations_in; | ||
555 | 521 | ||
556 | struct cfs_rq cfs; | 522 | struct cfs_rq cfs; |
557 | struct rt_rq rt; | 523 | struct rt_rq rt; |
@@ -601,6 +567,8 @@ struct rq { | |||
601 | 567 | ||
602 | u64 rt_avg; | 568 | u64 rt_avg; |
603 | u64 age_stamp; | 569 | u64 age_stamp; |
570 | u64 idle_stamp; | ||
571 | u64 avg_idle; | ||
604 | #endif | 572 | #endif |
605 | 573 | ||
606 | /* calc_load related fields */ | 574 | /* calc_load related fields */ |
@@ -655,6 +623,11 @@ static inline int cpu_of(struct rq *rq) | |||
655 | #endif | 623 | #endif |
656 | } | 624 | } |
657 | 625 | ||
626 | #define rcu_dereference_check_sched_domain(p) \ | ||
627 | rcu_dereference_check((p), \ | ||
628 | rcu_read_lock_sched_held() || \ | ||
629 | lockdep_is_held(&sched_domains_mutex)) | ||
630 | |||
658 | /* | 631 | /* |
659 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 632 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
660 | * See detach_destroy_domains: synchronize_sched for details. | 633 | * See detach_destroy_domains: synchronize_sched for details. |
@@ -663,7 +636,7 @@ static inline int cpu_of(struct rq *rq) | |||
663 | * preempt-disabled sections. | 636 | * preempt-disabled sections. |
664 | */ | 637 | */ |
665 | #define for_each_domain(cpu, __sd) \ | 638 | #define for_each_domain(cpu, __sd) \ |
666 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 639 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
667 | 640 | ||
668 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 641 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
669 | #define this_rq() (&__get_cpu_var(runqueues)) | 642 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -695,7 +668,7 @@ inline void update_rq_clock(struct rq *rq) | |||
695 | */ | 668 | */ |
696 | int runqueue_is_locked(int cpu) | 669 | int runqueue_is_locked(int cpu) |
697 | { | 670 | { |
698 | return spin_is_locked(&cpu_rq(cpu)->lock); | 671 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); |
699 | } | 672 | } |
700 | 673 | ||
701 | /* | 674 | /* |
@@ -782,7 +755,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
782 | if (!sched_feat_names[i]) | 755 | if (!sched_feat_names[i]) |
783 | return -EINVAL; | 756 | return -EINVAL; |
784 | 757 | ||
785 | filp->f_pos += cnt; | 758 | *ppos += cnt; |
786 | 759 | ||
787 | return cnt; | 760 | return cnt; |
788 | } | 761 | } |
@@ -824,6 +797,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
824 | * default: 0.25ms | 797 | * default: 0.25ms |
825 | */ | 798 | */ |
826 | unsigned int sysctl_sched_shares_ratelimit = 250000; | 799 | unsigned int sysctl_sched_shares_ratelimit = 250000; |
800 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
827 | 801 | ||
828 | /* | 802 | /* |
829 | * Inject some fuzzyness into changing the per-cpu group shares | 803 | * Inject some fuzzyness into changing the per-cpu group shares |
@@ -902,7 +876,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
902 | */ | 876 | */ |
903 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | 877 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); |
904 | 878 | ||
905 | spin_unlock_irq(&rq->lock); | 879 | raw_spin_unlock_irq(&rq->lock); |
906 | } | 880 | } |
907 | 881 | ||
908 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 882 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
@@ -926,9 +900,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
926 | next->oncpu = 1; | 900 | next->oncpu = 1; |
927 | #endif | 901 | #endif |
928 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 902 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
929 | spin_unlock_irq(&rq->lock); | 903 | raw_spin_unlock_irq(&rq->lock); |
930 | #else | 904 | #else |
931 | spin_unlock(&rq->lock); | 905 | raw_spin_unlock(&rq->lock); |
932 | #endif | 906 | #endif |
933 | } | 907 | } |
934 | 908 | ||
@@ -950,18 +924,35 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
950 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 924 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
951 | 925 | ||
952 | /* | 926 | /* |
927 | * Check whether the task is waking, we use this to synchronize against | ||
928 | * ttwu() so that task_cpu() reports a stable number. | ||
929 | * | ||
930 | * We need to make an exception for PF_STARTING tasks because the fork | ||
931 | * path might require task_rq_lock() to work, eg. it can call | ||
932 | * set_cpus_allowed_ptr() from the cpuset clone_ns code. | ||
933 | */ | ||
934 | static inline int task_is_waking(struct task_struct *p) | ||
935 | { | ||
936 | return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); | ||
937 | } | ||
938 | |||
939 | /* | ||
953 | * __task_rq_lock - lock the runqueue a given task resides on. | 940 | * __task_rq_lock - lock the runqueue a given task resides on. |
954 | * Must be called interrupts disabled. | 941 | * Must be called interrupts disabled. |
955 | */ | 942 | */ |
956 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 943 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
957 | __acquires(rq->lock) | 944 | __acquires(rq->lock) |
958 | { | 945 | { |
946 | struct rq *rq; | ||
947 | |||
959 | for (;;) { | 948 | for (;;) { |
960 | struct rq *rq = task_rq(p); | 949 | while (task_is_waking(p)) |
961 | spin_lock(&rq->lock); | 950 | cpu_relax(); |
962 | if (likely(rq == task_rq(p))) | 951 | rq = task_rq(p); |
952 | raw_spin_lock(&rq->lock); | ||
953 | if (likely(rq == task_rq(p) && !task_is_waking(p))) | ||
963 | return rq; | 954 | return rq; |
964 | spin_unlock(&rq->lock); | 955 | raw_spin_unlock(&rq->lock); |
965 | } | 956 | } |
966 | } | 957 | } |
967 | 958 | ||
@@ -976,12 +967,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
976 | struct rq *rq; | 967 | struct rq *rq; |
977 | 968 | ||
978 | for (;;) { | 969 | for (;;) { |
970 | while (task_is_waking(p)) | ||
971 | cpu_relax(); | ||
979 | local_irq_save(*flags); | 972 | local_irq_save(*flags); |
980 | rq = task_rq(p); | 973 | rq = task_rq(p); |
981 | spin_lock(&rq->lock); | 974 | raw_spin_lock(&rq->lock); |
982 | if (likely(rq == task_rq(p))) | 975 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
983 | return rq; | 976 | return rq; |
984 | spin_unlock_irqrestore(&rq->lock, *flags); | 977 | raw_spin_unlock_irqrestore(&rq->lock, *flags); |
985 | } | 978 | } |
986 | } | 979 | } |
987 | 980 | ||
@@ -990,19 +983,19 @@ void task_rq_unlock_wait(struct task_struct *p) | |||
990 | struct rq *rq = task_rq(p); | 983 | struct rq *rq = task_rq(p); |
991 | 984 | ||
992 | smp_mb(); /* spin-unlock-wait is not a full memory barrier */ | 985 | smp_mb(); /* spin-unlock-wait is not a full memory barrier */ |
993 | spin_unlock_wait(&rq->lock); | 986 | raw_spin_unlock_wait(&rq->lock); |
994 | } | 987 | } |
995 | 988 | ||
996 | static void __task_rq_unlock(struct rq *rq) | 989 | static void __task_rq_unlock(struct rq *rq) |
997 | __releases(rq->lock) | 990 | __releases(rq->lock) |
998 | { | 991 | { |
999 | spin_unlock(&rq->lock); | 992 | raw_spin_unlock(&rq->lock); |
1000 | } | 993 | } |
1001 | 994 | ||
1002 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 995 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) |
1003 | __releases(rq->lock) | 996 | __releases(rq->lock) |
1004 | { | 997 | { |
1005 | spin_unlock_irqrestore(&rq->lock, *flags); | 998 | raw_spin_unlock_irqrestore(&rq->lock, *flags); |
1006 | } | 999 | } |
1007 | 1000 | ||
1008 | /* | 1001 | /* |
@@ -1015,7 +1008,7 @@ static struct rq *this_rq_lock(void) | |||
1015 | 1008 | ||
1016 | local_irq_disable(); | 1009 | local_irq_disable(); |
1017 | rq = this_rq(); | 1010 | rq = this_rq(); |
1018 | spin_lock(&rq->lock); | 1011 | raw_spin_lock(&rq->lock); |
1019 | 1012 | ||
1020 | return rq; | 1013 | return rq; |
1021 | } | 1014 | } |
@@ -1062,10 +1055,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) | |||
1062 | 1055 | ||
1063 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 1056 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
1064 | 1057 | ||
1065 | spin_lock(&rq->lock); | 1058 | raw_spin_lock(&rq->lock); |
1066 | update_rq_clock(rq); | 1059 | update_rq_clock(rq); |
1067 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | 1060 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); |
1068 | spin_unlock(&rq->lock); | 1061 | raw_spin_unlock(&rq->lock); |
1069 | 1062 | ||
1070 | return HRTIMER_NORESTART; | 1063 | return HRTIMER_NORESTART; |
1071 | } | 1064 | } |
@@ -1078,10 +1071,10 @@ static void __hrtick_start(void *arg) | |||
1078 | { | 1071 | { |
1079 | struct rq *rq = arg; | 1072 | struct rq *rq = arg; |
1080 | 1073 | ||
1081 | spin_lock(&rq->lock); | 1074 | raw_spin_lock(&rq->lock); |
1082 | hrtimer_restart(&rq->hrtick_timer); | 1075 | hrtimer_restart(&rq->hrtick_timer); |
1083 | rq->hrtick_csd_pending = 0; | 1076 | rq->hrtick_csd_pending = 0; |
1084 | spin_unlock(&rq->lock); | 1077 | raw_spin_unlock(&rq->lock); |
1085 | } | 1078 | } |
1086 | 1079 | ||
1087 | /* | 1080 | /* |
@@ -1188,7 +1181,7 @@ static void resched_task(struct task_struct *p) | |||
1188 | { | 1181 | { |
1189 | int cpu; | 1182 | int cpu; |
1190 | 1183 | ||
1191 | assert_spin_locked(&task_rq(p)->lock); | 1184 | assert_raw_spin_locked(&task_rq(p)->lock); |
1192 | 1185 | ||
1193 | if (test_tsk_need_resched(p)) | 1186 | if (test_tsk_need_resched(p)) |
1194 | return; | 1187 | return; |
@@ -1210,10 +1203,10 @@ static void resched_cpu(int cpu) | |||
1210 | struct rq *rq = cpu_rq(cpu); | 1203 | struct rq *rq = cpu_rq(cpu); |
1211 | unsigned long flags; | 1204 | unsigned long flags; |
1212 | 1205 | ||
1213 | if (!spin_trylock_irqsave(&rq->lock, flags)) | 1206 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) |
1214 | return; | 1207 | return; |
1215 | resched_task(cpu_curr(cpu)); | 1208 | resched_task(cpu_curr(cpu)); |
1216 | spin_unlock_irqrestore(&rq->lock, flags); | 1209 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1217 | } | 1210 | } |
1218 | 1211 | ||
1219 | #ifdef CONFIG_NO_HZ | 1212 | #ifdef CONFIG_NO_HZ |
@@ -1282,7 +1275,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | |||
1282 | #else /* !CONFIG_SMP */ | 1275 | #else /* !CONFIG_SMP */ |
1283 | static void resched_task(struct task_struct *p) | 1276 | static void resched_task(struct task_struct *p) |
1284 | { | 1277 | { |
1285 | assert_spin_locked(&task_rq(p)->lock); | 1278 | assert_raw_spin_locked(&task_rq(p)->lock); |
1286 | set_tsk_need_resched(p); | 1279 | set_tsk_need_resched(p); |
1287 | } | 1280 | } |
1288 | 1281 | ||
@@ -1399,32 +1392,6 @@ static const u32 prio_to_wmult[40] = { | |||
1399 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1392 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1400 | }; | 1393 | }; |
1401 | 1394 | ||
1402 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
1403 | |||
1404 | /* | ||
1405 | * runqueue iterator, to support SMP load-balancing between different | ||
1406 | * scheduling classes, without having to expose their internal data | ||
1407 | * structures to the load-balancing proper: | ||
1408 | */ | ||
1409 | struct rq_iterator { | ||
1410 | void *arg; | ||
1411 | struct task_struct *(*start)(void *); | ||
1412 | struct task_struct *(*next)(void *); | ||
1413 | }; | ||
1414 | |||
1415 | #ifdef CONFIG_SMP | ||
1416 | static unsigned long | ||
1417 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1418 | unsigned long max_load_move, struct sched_domain *sd, | ||
1419 | enum cpu_idle_type idle, int *all_pinned, | ||
1420 | int *this_best_prio, struct rq_iterator *iterator); | ||
1421 | |||
1422 | static int | ||
1423 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1424 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1425 | struct rq_iterator *iterator); | ||
1426 | #endif | ||
1427 | |||
1428 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 1395 | /* Time spent by the tasks of the cpu accounting group executing in ... */ |
1429 | enum cpuacct_stat_index { | 1396 | enum cpuacct_stat_index { |
1430 | CPUACCT_STAT_USER, /* ... user mode */ | 1397 | CPUACCT_STAT_USER, /* ... user mode */ |
@@ -1540,7 +1507,7 @@ static unsigned long target_load(int cpu, int type) | |||
1540 | 1507 | ||
1541 | static struct sched_group *group_of(int cpu) | 1508 | static struct sched_group *group_of(int cpu) |
1542 | { | 1509 | { |
1543 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | 1510 | struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); |
1544 | 1511 | ||
1545 | if (!sd) | 1512 | if (!sd) |
1546 | return NULL; | 1513 | return NULL; |
@@ -1575,7 +1542,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1575 | 1542 | ||
1576 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1543 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1577 | 1544 | ||
1578 | static __read_mostly unsigned long *update_shares_data; | 1545 | static __read_mostly unsigned long __percpu *update_shares_data; |
1579 | 1546 | ||
1580 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1547 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1581 | 1548 | ||
@@ -1609,11 +1576,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1609 | struct rq *rq = cpu_rq(cpu); | 1576 | struct rq *rq = cpu_rq(cpu); |
1610 | unsigned long flags; | 1577 | unsigned long flags; |
1611 | 1578 | ||
1612 | spin_lock_irqsave(&rq->lock, flags); | 1579 | raw_spin_lock_irqsave(&rq->lock, flags); |
1613 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | 1580 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1614 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | 1581 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; |
1615 | __set_se_shares(tg->se[cpu], shares); | 1582 | __set_se_shares(tg->se[cpu], shares); |
1616 | spin_unlock_irqrestore(&rq->lock, flags); | 1583 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1617 | } | 1584 | } |
1618 | } | 1585 | } |
1619 | 1586 | ||
@@ -1624,7 +1591,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1624 | */ | 1591 | */ |
1625 | static int tg_shares_up(struct task_group *tg, void *data) | 1592 | static int tg_shares_up(struct task_group *tg, void *data) |
1626 | { | 1593 | { |
1627 | unsigned long weight, rq_weight = 0, shares = 0; | 1594 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; |
1628 | unsigned long *usd_rq_weight; | 1595 | unsigned long *usd_rq_weight; |
1629 | struct sched_domain *sd = data; | 1596 | struct sched_domain *sd = data; |
1630 | unsigned long flags; | 1597 | unsigned long flags; |
@@ -1640,6 +1607,7 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1640 | weight = tg->cfs_rq[i]->load.weight; | 1607 | weight = tg->cfs_rq[i]->load.weight; |
1641 | usd_rq_weight[i] = weight; | 1608 | usd_rq_weight[i] = weight; |
1642 | 1609 | ||
1610 | rq_weight += weight; | ||
1643 | /* | 1611 | /* |
1644 | * If there are currently no tasks on the cpu pretend there | 1612 | * If there are currently no tasks on the cpu pretend there |
1645 | * is one of average load so that when a new task gets to | 1613 | * is one of average load so that when a new task gets to |
@@ -1648,10 +1616,13 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1648 | if (!weight) | 1616 | if (!weight) |
1649 | weight = NICE_0_LOAD; | 1617 | weight = NICE_0_LOAD; |
1650 | 1618 | ||
1651 | rq_weight += weight; | 1619 | sum_weight += weight; |
1652 | shares += tg->cfs_rq[i]->shares; | 1620 | shares += tg->cfs_rq[i]->shares; |
1653 | } | 1621 | } |
1654 | 1622 | ||
1623 | if (!rq_weight) | ||
1624 | rq_weight = sum_weight; | ||
1625 | |||
1655 | if ((!shares && rq_weight) || shares > tg->shares) | 1626 | if ((!shares && rq_weight) || shares > tg->shares) |
1656 | shares = tg->shares; | 1627 | shares = tg->shares; |
1657 | 1628 | ||
@@ -1706,16 +1677,6 @@ static void update_shares(struct sched_domain *sd) | |||
1706 | } | 1677 | } |
1707 | } | 1678 | } |
1708 | 1679 | ||
1709 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1710 | { | ||
1711 | if (root_task_group_empty()) | ||
1712 | return; | ||
1713 | |||
1714 | spin_unlock(&rq->lock); | ||
1715 | update_shares(sd); | ||
1716 | spin_lock(&rq->lock); | ||
1717 | } | ||
1718 | |||
1719 | static void update_h_load(long cpu) | 1680 | static void update_h_load(long cpu) |
1720 | { | 1681 | { |
1721 | if (root_task_group_empty()) | 1682 | if (root_task_group_empty()) |
@@ -1730,10 +1691,6 @@ static inline void update_shares(struct sched_domain *sd) | |||
1730 | { | 1691 | { |
1731 | } | 1692 | } |
1732 | 1693 | ||
1733 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1734 | { | ||
1735 | } | ||
1736 | |||
1737 | #endif | 1694 | #endif |
1738 | 1695 | ||
1739 | #ifdef CONFIG_PREEMPT | 1696 | #ifdef CONFIG_PREEMPT |
@@ -1753,7 +1710,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1753 | __acquires(busiest->lock) | 1710 | __acquires(busiest->lock) |
1754 | __acquires(this_rq->lock) | 1711 | __acquires(this_rq->lock) |
1755 | { | 1712 | { |
1756 | spin_unlock(&this_rq->lock); | 1713 | raw_spin_unlock(&this_rq->lock); |
1757 | double_rq_lock(this_rq, busiest); | 1714 | double_rq_lock(this_rq, busiest); |
1758 | 1715 | ||
1759 | return 1; | 1716 | return 1; |
@@ -1774,14 +1731,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1774 | { | 1731 | { |
1775 | int ret = 0; | 1732 | int ret = 0; |
1776 | 1733 | ||
1777 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1734 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { |
1778 | if (busiest < this_rq) { | 1735 | if (busiest < this_rq) { |
1779 | spin_unlock(&this_rq->lock); | 1736 | raw_spin_unlock(&this_rq->lock); |
1780 | spin_lock(&busiest->lock); | 1737 | raw_spin_lock(&busiest->lock); |
1781 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); | 1738 | raw_spin_lock_nested(&this_rq->lock, |
1739 | SINGLE_DEPTH_NESTING); | ||
1782 | ret = 1; | 1740 | ret = 1; |
1783 | } else | 1741 | } else |
1784 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); | 1742 | raw_spin_lock_nested(&busiest->lock, |
1743 | SINGLE_DEPTH_NESTING); | ||
1785 | } | 1744 | } |
1786 | return ret; | 1745 | return ret; |
1787 | } | 1746 | } |
@@ -1795,7 +1754,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1795 | { | 1754 | { |
1796 | if (unlikely(!irqs_disabled())) { | 1755 | if (unlikely(!irqs_disabled())) { |
1797 | /* printk() doesn't work good under rq->lock */ | 1756 | /* printk() doesn't work good under rq->lock */ |
1798 | spin_unlock(&this_rq->lock); | 1757 | raw_spin_unlock(&this_rq->lock); |
1799 | BUG_ON(1); | 1758 | BUG_ON(1); |
1800 | } | 1759 | } |
1801 | 1760 | ||
@@ -1805,9 +1764,54 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1805 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | 1764 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) |
1806 | __releases(busiest->lock) | 1765 | __releases(busiest->lock) |
1807 | { | 1766 | { |
1808 | spin_unlock(&busiest->lock); | 1767 | raw_spin_unlock(&busiest->lock); |
1809 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1768 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
1810 | } | 1769 | } |
1770 | |||
1771 | /* | ||
1772 | * double_rq_lock - safely lock two runqueues | ||
1773 | * | ||
1774 | * Note this does not disable interrupts like task_rq_lock, | ||
1775 | * you need to do so manually before calling. | ||
1776 | */ | ||
1777 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1778 | __acquires(rq1->lock) | ||
1779 | __acquires(rq2->lock) | ||
1780 | { | ||
1781 | BUG_ON(!irqs_disabled()); | ||
1782 | if (rq1 == rq2) { | ||
1783 | raw_spin_lock(&rq1->lock); | ||
1784 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1785 | } else { | ||
1786 | if (rq1 < rq2) { | ||
1787 | raw_spin_lock(&rq1->lock); | ||
1788 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1789 | } else { | ||
1790 | raw_spin_lock(&rq2->lock); | ||
1791 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1792 | } | ||
1793 | } | ||
1794 | update_rq_clock(rq1); | ||
1795 | update_rq_clock(rq2); | ||
1796 | } | ||
1797 | |||
1798 | /* | ||
1799 | * double_rq_unlock - safely unlock two runqueues | ||
1800 | * | ||
1801 | * Note this does not restore interrupts like task_rq_unlock, | ||
1802 | * you need to do so manually after calling. | ||
1803 | */ | ||
1804 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1805 | __releases(rq1->lock) | ||
1806 | __releases(rq2->lock) | ||
1807 | { | ||
1808 | raw_spin_unlock(&rq1->lock); | ||
1809 | if (rq1 != rq2) | ||
1810 | raw_spin_unlock(&rq2->lock); | ||
1811 | else | ||
1812 | __release(rq2->lock); | ||
1813 | } | ||
1814 | |||
1811 | #endif | 1815 | #endif |
1812 | 1816 | ||
1813 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1817 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1820,20 +1824,31 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1820 | #endif | 1824 | #endif |
1821 | 1825 | ||
1822 | static void calc_load_account_active(struct rq *this_rq); | 1826 | static void calc_load_account_active(struct rq *this_rq); |
1827 | static void update_sysctl(void); | ||
1828 | static int get_update_sysctl_factor(void); | ||
1823 | 1829 | ||
1824 | #include "sched_stats.h" | 1830 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1825 | #include "sched_idletask.c" | 1831 | { |
1826 | #include "sched_fair.c" | 1832 | set_task_rq(p, cpu); |
1827 | #include "sched_rt.c" | 1833 | #ifdef CONFIG_SMP |
1828 | #include "../litmus/sched_litmus.c" | 1834 | /* |
1829 | #ifdef CONFIG_SCHED_DEBUG | 1835 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
1830 | # include "sched_debug.c" | 1836 | * successfuly executed on another CPU. We must ensure that updates of |
1837 | * per-task data have been completed by this moment. | ||
1838 | */ | ||
1839 | smp_wmb(); | ||
1840 | task_thread_info(p)->cpu = cpu; | ||
1831 | #endif | 1841 | #endif |
1842 | } | ||
1843 | |||
1844 | static const struct sched_class rt_sched_class; | ||
1832 | 1845 | ||
1833 | #define sched_class_highest (&litmus_sched_class) | 1846 | #define sched_class_highest (&litmus_sched_class) |
1834 | #define for_each_class(class) \ | 1847 | #define for_each_class(class) \ |
1835 | for (class = sched_class_highest; class; class = class->next) | 1848 | for (class = sched_class_highest; class; class = class->next) |
1836 | 1849 | ||
1850 | #include "sched_stats.h" | ||
1851 | |||
1837 | static void inc_nr_running(struct rq *rq) | 1852 | static void inc_nr_running(struct rq *rq) |
1838 | { | 1853 | { |
1839 | rq->nr_running++; | 1854 | rq->nr_running++; |
@@ -1871,13 +1886,14 @@ static void update_avg(u64 *avg, u64 sample) | |||
1871 | *avg += diff >> 3; | 1886 | *avg += diff >> 3; |
1872 | } | 1887 | } |
1873 | 1888 | ||
1874 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1889 | static void |
1890 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
1875 | { | 1891 | { |
1876 | if (wakeup) | 1892 | if (wakeup) |
1877 | p->se.start_runtime = p->se.sum_exec_runtime; | 1893 | p->se.start_runtime = p->se.sum_exec_runtime; |
1878 | 1894 | ||
1879 | sched_info_queued(p); | 1895 | sched_info_queued(p); |
1880 | p->sched_class->enqueue_task(rq, p, wakeup); | 1896 | p->sched_class->enqueue_task(rq, p, wakeup, head); |
1881 | p->se.on_rq = 1; | 1897 | p->se.on_rq = 1; |
1882 | } | 1898 | } |
1883 | 1899 | ||
@@ -1900,6 +1916,38 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1900 | } | 1916 | } |
1901 | 1917 | ||
1902 | /* | 1918 | /* |
1919 | * activate_task - move a task to the runqueue. | ||
1920 | */ | ||
1921 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1922 | { | ||
1923 | if (task_contributes_to_load(p)) | ||
1924 | rq->nr_uninterruptible--; | ||
1925 | |||
1926 | enqueue_task(rq, p, wakeup, false); | ||
1927 | inc_nr_running(rq); | ||
1928 | } | ||
1929 | |||
1930 | /* | ||
1931 | * deactivate_task - remove a task from the runqueue. | ||
1932 | */ | ||
1933 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1934 | { | ||
1935 | if (task_contributes_to_load(p)) | ||
1936 | rq->nr_uninterruptible++; | ||
1937 | |||
1938 | dequeue_task(rq, p, sleep); | ||
1939 | dec_nr_running(rq); | ||
1940 | } | ||
1941 | |||
1942 | #include "sched_idletask.c" | ||
1943 | #include "sched_fair.c" | ||
1944 | #include "sched_rt.c" | ||
1945 | #include "../litmus/sched_litmus.c" | ||
1946 | #ifdef CONFIG_SCHED_DEBUG | ||
1947 | # include "sched_debug.c" | ||
1948 | #endif | ||
1949 | |||
1950 | /* | ||
1903 | * __normal_prio - return the priority that is based on the static prio | 1951 | * __normal_prio - return the priority that is based on the static prio |
1904 | */ | 1952 | */ |
1905 | static inline int __normal_prio(struct task_struct *p) | 1953 | static inline int __normal_prio(struct task_struct *p) |
@@ -1945,30 +1993,6 @@ static int effective_prio(struct task_struct *p) | |||
1945 | return p->prio; | 1993 | return p->prio; |
1946 | } | 1994 | } |
1947 | 1995 | ||
1948 | /* | ||
1949 | * activate_task - move a task to the runqueue. | ||
1950 | */ | ||
1951 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1952 | { | ||
1953 | if (task_contributes_to_load(p)) | ||
1954 | rq->nr_uninterruptible--; | ||
1955 | |||
1956 | enqueue_task(rq, p, wakeup); | ||
1957 | inc_nr_running(rq); | ||
1958 | } | ||
1959 | |||
1960 | /* | ||
1961 | * deactivate_task - remove a task from the runqueue. | ||
1962 | */ | ||
1963 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1964 | { | ||
1965 | if (task_contributes_to_load(p)) | ||
1966 | rq->nr_uninterruptible++; | ||
1967 | |||
1968 | dequeue_task(rq, p, sleep); | ||
1969 | dec_nr_running(rq); | ||
1970 | } | ||
1971 | |||
1972 | /** | 1996 | /** |
1973 | * task_curr - is this task currently executing on a CPU? | 1997 | * task_curr - is this task currently executing on a CPU? |
1974 | * @p: the task in question. | 1998 | * @p: the task in question. |
@@ -1978,20 +2002,6 @@ inline int task_curr(const struct task_struct *p) | |||
1978 | return cpu_curr(task_cpu(p)) == p; | 2002 | return cpu_curr(task_cpu(p)) == p; |
1979 | } | 2003 | } |
1980 | 2004 | ||
1981 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
1982 | { | ||
1983 | set_task_rq(p, cpu); | ||
1984 | #ifdef CONFIG_SMP | ||
1985 | /* | ||
1986 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
1987 | * successfuly executed on another CPU. We must ensure that updates of | ||
1988 | * per-task data have been completed by this moment. | ||
1989 | */ | ||
1990 | smp_wmb(); | ||
1991 | task_thread_info(p)->cpu = cpu; | ||
1992 | #endif | ||
1993 | } | ||
1994 | |||
1995 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2005 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
1996 | const struct sched_class *prev_class, | 2006 | const struct sched_class *prev_class, |
1997 | int oldprio, int running) | 2007 | int oldprio, int running) |
@@ -2004,38 +2014,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2004 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2014 | p->sched_class->prio_changed(rq, p, oldprio, running); |
2005 | } | 2015 | } |
2006 | 2016 | ||
2007 | /** | ||
2008 | * kthread_bind - bind a just-created kthread to a cpu. | ||
2009 | * @p: thread created by kthread_create(). | ||
2010 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
2011 | * | ||
2012 | * Description: This function is equivalent to set_cpus_allowed(), | ||
2013 | * except that @cpu doesn't need to be online, and the thread must be | ||
2014 | * stopped (i.e., just returned from kthread_create()). | ||
2015 | * | ||
2016 | * Function lives here instead of kthread.c because it messes with | ||
2017 | * scheduler internals which require locking. | ||
2018 | */ | ||
2019 | void kthread_bind(struct task_struct *p, unsigned int cpu) | ||
2020 | { | ||
2021 | struct rq *rq = cpu_rq(cpu); | ||
2022 | unsigned long flags; | ||
2023 | |||
2024 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
2025 | if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { | ||
2026 | WARN_ON(1); | ||
2027 | return; | ||
2028 | } | ||
2029 | |||
2030 | spin_lock_irqsave(&rq->lock, flags); | ||
2031 | set_task_cpu(p, cpu); | ||
2032 | p->cpus_allowed = cpumask_of_cpu(cpu); | ||
2033 | p->rt.nr_cpus_allowed = 1; | ||
2034 | p->flags |= PF_THREAD_BOUND; | ||
2035 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2036 | } | ||
2037 | EXPORT_SYMBOL(kthread_bind); | ||
2038 | |||
2039 | #ifdef CONFIG_SMP | 2017 | #ifdef CONFIG_SMP |
2040 | /* | 2018 | /* |
2041 | * Is this task likely cache-hot: | 2019 | * Is this task likely cache-hot: |
@@ -2045,6 +2023,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2045 | { | 2023 | { |
2046 | s64 delta; | 2024 | s64 delta; |
2047 | 2025 | ||
2026 | if (p->sched_class != &fair_sched_class) | ||
2027 | return 0; | ||
2028 | |||
2048 | /* | 2029 | /* |
2049 | * Buddy candidates are cache hot: | 2030 | * Buddy candidates are cache hot: |
2050 | */ | 2031 | */ |
@@ -2053,9 +2034,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2053 | &p->se == cfs_rq_of(&p->se)->last)) | 2034 | &p->se == cfs_rq_of(&p->se)->last)) |
2054 | return 1; | 2035 | return 1; |
2055 | 2036 | ||
2056 | if (p->sched_class != &fair_sched_class) | ||
2057 | return 0; | ||
2058 | |||
2059 | if (sysctl_sched_migration_cost == -1) | 2037 | if (sysctl_sched_migration_cost == -1) |
2060 | return 1; | 2038 | return 1; |
2061 | if (sysctl_sched_migration_cost == 0) | 2039 | if (sysctl_sched_migration_cost == 0) |
@@ -2066,39 +2044,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2066 | return delta < (s64)sysctl_sched_migration_cost; | 2044 | return delta < (s64)sysctl_sched_migration_cost; |
2067 | } | 2045 | } |
2068 | 2046 | ||
2069 | |||
2070 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 2047 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
2071 | { | 2048 | { |
2072 | int old_cpu = task_cpu(p); | 2049 | #ifdef CONFIG_SCHED_DEBUG |
2073 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 2050 | /* |
2074 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), | 2051 | * We should never call set_task_cpu() on a blocked task, |
2075 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | 2052 | * ttwu() will sort out the placement. |
2076 | u64 clock_offset; | 2053 | */ |
2077 | 2054 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | |
2078 | clock_offset = old_rq->clock - new_rq->clock; | 2055 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2056 | #endif | ||
2079 | 2057 | ||
2080 | trace_sched_migrate_task(p, new_cpu); | 2058 | trace_sched_migrate_task(p, new_cpu); |
2081 | 2059 | ||
2082 | #ifdef CONFIG_SCHEDSTATS | 2060 | if (task_cpu(p) != new_cpu) { |
2083 | if (p->se.wait_start) | ||
2084 | p->se.wait_start -= clock_offset; | ||
2085 | if (p->se.sleep_start) | ||
2086 | p->se.sleep_start -= clock_offset; | ||
2087 | if (p->se.block_start) | ||
2088 | p->se.block_start -= clock_offset; | ||
2089 | #endif | ||
2090 | if (old_cpu != new_cpu) { | ||
2091 | p->se.nr_migrations++; | 2061 | p->se.nr_migrations++; |
2092 | new_rq->nr_migrations_in++; | 2062 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); |
2093 | #ifdef CONFIG_SCHEDSTATS | ||
2094 | if (task_hot(p, old_rq->clock, NULL)) | ||
2095 | schedstat_inc(p, se.nr_forced2_migrations); | ||
2096 | #endif | ||
2097 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, | ||
2098 | 1, 1, NULL, 0); | ||
2099 | } | 2063 | } |
2100 | p->se.vruntime -= old_cfsrq->min_vruntime - | ||
2101 | new_cfsrq->min_vruntime; | ||
2102 | 2064 | ||
2103 | __set_task_cpu(p, new_cpu); | 2065 | __set_task_cpu(p, new_cpu); |
2104 | } | 2066 | } |
@@ -2123,12 +2085,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
2123 | 2085 | ||
2124 | /* | 2086 | /* |
2125 | * If the task is not on a runqueue (and not running), then | 2087 | * If the task is not on a runqueue (and not running), then |
2126 | * it is sufficient to simply update the task's cpu field. | 2088 | * the next wake-up will properly place the task. |
2127 | */ | 2089 | */ |
2128 | if (!p->se.on_rq && !task_running(rq, p)) { | 2090 | if (!p->se.on_rq && !task_running(rq, p)) |
2129 | set_task_cpu(p, dest_cpu); | ||
2130 | return 0; | 2091 | return 0; |
2131 | } | ||
2132 | 2092 | ||
2133 | init_completion(&req->done); | 2093 | init_completion(&req->done); |
2134 | req->task = p; | 2094 | req->task = p; |
@@ -2333,6 +2293,75 @@ void task_oncpu_function_call(struct task_struct *p, | |||
2333 | preempt_enable(); | 2293 | preempt_enable(); |
2334 | } | 2294 | } |
2335 | 2295 | ||
2296 | #ifdef CONFIG_SMP | ||
2297 | static int select_fallback_rq(int cpu, struct task_struct *p) | ||
2298 | { | ||
2299 | int dest_cpu; | ||
2300 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | ||
2301 | |||
2302 | /* Look for allowed, online CPU in same node. */ | ||
2303 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | ||
2304 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | ||
2305 | return dest_cpu; | ||
2306 | |||
2307 | /* Any allowed, online CPU? */ | ||
2308 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | ||
2309 | if (dest_cpu < nr_cpu_ids) | ||
2310 | return dest_cpu; | ||
2311 | |||
2312 | /* No more Mr. Nice Guy. */ | ||
2313 | if (dest_cpu >= nr_cpu_ids) { | ||
2314 | rcu_read_lock(); | ||
2315 | cpuset_cpus_allowed_locked(p, &p->cpus_allowed); | ||
2316 | rcu_read_unlock(); | ||
2317 | dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | ||
2318 | |||
2319 | /* | ||
2320 | * Don't tell them about moving exiting tasks or | ||
2321 | * kernel threads (both mm NULL), since they never | ||
2322 | * leave kernel. | ||
2323 | */ | ||
2324 | if (p->mm && printk_ratelimit()) { | ||
2325 | printk(KERN_INFO "process %d (%s) no " | ||
2326 | "longer affine to cpu%d\n", | ||
2327 | task_pid_nr(p), p->comm, cpu); | ||
2328 | } | ||
2329 | } | ||
2330 | |||
2331 | return dest_cpu; | ||
2332 | } | ||
2333 | |||
2334 | /* | ||
2335 | * Gets called from 3 sites (exec, fork, wakeup), since it is called without | ||
2336 | * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done | ||
2337 | * by: | ||
2338 | * | ||
2339 | * exec: is unstable, retry loop | ||
2340 | * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING | ||
2341 | */ | ||
2342 | static inline | ||
2343 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | ||
2344 | { | ||
2345 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | ||
2346 | |||
2347 | /* | ||
2348 | * In order not to call set_task_cpu() on a blocking task we need | ||
2349 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | ||
2350 | * cpu. | ||
2351 | * | ||
2352 | * Since this is common to all placement strategies, this lives here. | ||
2353 | * | ||
2354 | * [ this allows ->select_task() to simply return task_cpu(p) and | ||
2355 | * not worry about this generic constraint ] | ||
2356 | */ | ||
2357 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | ||
2358 | !cpu_online(cpu))) | ||
2359 | cpu = select_fallback_rq(task_cpu(p), p); | ||
2360 | |||
2361 | return cpu; | ||
2362 | } | ||
2363 | #endif | ||
2364 | |||
2336 | /*** | 2365 | /*** |
2337 | * try_to_wake_up - wake up a thread | 2366 | * try_to_wake_up - wake up a thread |
2338 | * @p: the to-be-woken-up thread | 2367 | * @p: the to-be-woken-up thread |
@@ -2352,7 +2381,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2352 | { | 2381 | { |
2353 | int cpu, orig_cpu, this_cpu, success = 0; | 2382 | int cpu, orig_cpu, this_cpu, success = 0; |
2354 | unsigned long flags; | 2383 | unsigned long flags; |
2355 | struct rq *rq, *orig_rq; | 2384 | struct rq *rq; |
2356 | 2385 | ||
2357 | if (is_realtime(p)) | 2386 | if (is_realtime(p)) |
2358 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); | 2387 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); |
@@ -2363,7 +2392,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2363 | this_cpu = get_cpu(); | 2392 | this_cpu = get_cpu(); |
2364 | 2393 | ||
2365 | smp_wmb(); | 2394 | smp_wmb(); |
2366 | rq = orig_rq = task_rq_lock(p, &flags); | 2395 | rq = task_rq_lock(p, &flags); |
2367 | update_rq_clock(rq); | 2396 | update_rq_clock(rq); |
2368 | if (!(p->state & state)) | 2397 | if (!(p->state & state)) |
2369 | goto out; | 2398 | goto out; |
@@ -2387,19 +2416,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2387 | if (task_contributes_to_load(p)) | 2416 | if (task_contributes_to_load(p)) |
2388 | rq->nr_uninterruptible--; | 2417 | rq->nr_uninterruptible--; |
2389 | p->state = TASK_WAKING; | 2418 | p->state = TASK_WAKING; |
2390 | task_rq_unlock(rq, &flags); | ||
2391 | 2419 | ||
2392 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2420 | if (p->sched_class->task_waking) |
2393 | if (cpu != orig_cpu) | 2421 | p->sched_class->task_waking(rq, p); |
2394 | set_task_cpu(p, cpu); | ||
2395 | 2422 | ||
2396 | rq = task_rq_lock(p, &flags); | 2423 | __task_rq_unlock(rq); |
2397 | 2424 | ||
2398 | if (rq != orig_rq) | 2425 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2399 | update_rq_clock(rq); | 2426 | if (cpu != orig_cpu) { |
2427 | /* | ||
2428 | * Since we migrate the task without holding any rq->lock, | ||
2429 | * we need to be careful with task_rq_lock(), since that | ||
2430 | * might end up locking an invalid rq. | ||
2431 | */ | ||
2432 | set_task_cpu(p, cpu); | ||
2433 | } | ||
2400 | 2434 | ||
2435 | rq = cpu_rq(cpu); | ||
2436 | raw_spin_lock(&rq->lock); | ||
2437 | update_rq_clock(rq); | ||
2438 | |||
2439 | /* | ||
2440 | * We migrated the task without holding either rq->lock, however | ||
2441 | * since the task is not on the task list itself, nobody else | ||
2442 | * will try and migrate the task, hence the rq should match the | ||
2443 | * cpu we just moved it to. | ||
2444 | */ | ||
2445 | WARN_ON(task_cpu(p) != cpu); | ||
2401 | WARN_ON(p->state != TASK_WAKING); | 2446 | WARN_ON(p->state != TASK_WAKING); |
2402 | cpu = task_cpu(p); | ||
2403 | 2447 | ||
2404 | #ifdef CONFIG_SCHEDSTATS | 2448 | #ifdef CONFIG_SCHEDSTATS |
2405 | schedstat_inc(rq, ttwu_count); | 2449 | schedstat_inc(rq, ttwu_count); |
@@ -2452,8 +2496,19 @@ out_running: | |||
2452 | 2496 | ||
2453 | p->state = TASK_RUNNING; | 2497 | p->state = TASK_RUNNING; |
2454 | #ifdef CONFIG_SMP | 2498 | #ifdef CONFIG_SMP |
2455 | if (p->sched_class->task_wake_up) | 2499 | if (p->sched_class->task_woken) |
2456 | p->sched_class->task_wake_up(rq, p); | 2500 | p->sched_class->task_woken(rq, p); |
2501 | |||
2502 | if (unlikely(rq->idle_stamp)) { | ||
2503 | u64 delta = rq->clock - rq->idle_stamp; | ||
2504 | u64 max = 2*sysctl_sched_migration_cost; | ||
2505 | |||
2506 | if (delta > max) | ||
2507 | rq->avg_idle = max; | ||
2508 | else | ||
2509 | update_avg(&rq->avg_idle, delta); | ||
2510 | rq->idle_stamp = 0; | ||
2511 | } | ||
2457 | #endif | 2512 | #endif |
2458 | out: | 2513 | out: |
2459 | if (is_realtime(p)) | 2514 | if (is_realtime(p)) |
@@ -2502,7 +2557,6 @@ static void __sched_fork(struct task_struct *p) | |||
2502 | p->se.avg_overlap = 0; | 2557 | p->se.avg_overlap = 0; |
2503 | p->se.start_runtime = 0; | 2558 | p->se.start_runtime = 0; |
2504 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2559 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
2505 | p->se.avg_running = 0; | ||
2506 | 2560 | ||
2507 | #ifdef CONFIG_SCHEDSTATS | 2561 | #ifdef CONFIG_SCHEDSTATS |
2508 | p->se.wait_start = 0; | 2562 | p->se.wait_start = 0; |
@@ -2524,7 +2578,6 @@ static void __sched_fork(struct task_struct *p) | |||
2524 | p->se.nr_failed_migrations_running = 0; | 2578 | p->se.nr_failed_migrations_running = 0; |
2525 | p->se.nr_failed_migrations_hot = 0; | 2579 | p->se.nr_failed_migrations_hot = 0; |
2526 | p->se.nr_forced_migrations = 0; | 2580 | p->se.nr_forced_migrations = 0; |
2527 | p->se.nr_forced2_migrations = 0; | ||
2528 | 2581 | ||
2529 | p->se.nr_wakeups = 0; | 2582 | p->se.nr_wakeups = 0; |
2530 | p->se.nr_wakeups_sync = 0; | 2583 | p->se.nr_wakeups_sync = 0; |
@@ -2545,14 +2598,6 @@ static void __sched_fork(struct task_struct *p) | |||
2545 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2598 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2546 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2599 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
2547 | #endif | 2600 | #endif |
2548 | |||
2549 | /* | ||
2550 | * We mark the process as running here, but have not actually | ||
2551 | * inserted it onto the runqueue yet. This guarantees that | ||
2552 | * nobody will actually run it, and a signal or other external | ||
2553 | * event cannot wake it up and insert it on the runqueue either. | ||
2554 | */ | ||
2555 | p->state = TASK_RUNNING; | ||
2556 | } | 2601 | } |
2557 | 2602 | ||
2558 | /* | 2603 | /* |
@@ -2563,6 +2608,12 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2563 | int cpu = get_cpu(); | 2608 | int cpu = get_cpu(); |
2564 | 2609 | ||
2565 | __sched_fork(p); | 2610 | __sched_fork(p); |
2611 | /* | ||
2612 | * We mark the process as waking here. This guarantees that | ||
2613 | * nobody will actually run it, and a signal or other external | ||
2614 | * event cannot wake it up and insert it on the runqueue either. | ||
2615 | */ | ||
2616 | p->state = TASK_WAKING; | ||
2566 | 2617 | ||
2567 | /* | 2618 | /* |
2568 | * Revert to default priority/policy on fork if requested. | 2619 | * Revert to default priority/policy on fork if requested. |
@@ -2594,9 +2645,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2594 | if (!rt_prio(p->prio)) | 2645 | if (!rt_prio(p->prio)) |
2595 | p->sched_class = &fair_sched_class; | 2646 | p->sched_class = &fair_sched_class; |
2596 | 2647 | ||
2597 | #ifdef CONFIG_SMP | 2648 | if (p->sched_class->task_fork) |
2598 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | 2649 | p->sched_class->task_fork(p); |
2599 | #endif | 2650 | |
2600 | set_task_cpu(p, cpu); | 2651 | set_task_cpu(p, cpu); |
2601 | 2652 | ||
2602 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2653 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
@@ -2626,28 +2677,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2626 | { | 2677 | { |
2627 | unsigned long flags; | 2678 | unsigned long flags; |
2628 | struct rq *rq; | 2679 | struct rq *rq; |
2680 | int cpu __maybe_unused = get_cpu(); | ||
2629 | 2681 | ||
2630 | rq = task_rq_lock(p, &flags); | 2682 | #ifdef CONFIG_SMP |
2631 | BUG_ON(p->state != TASK_RUNNING); | 2683 | /* |
2632 | update_rq_clock(rq); | 2684 | * Fork balancing, do it here and not earlier because: |
2685 | * - cpus_allowed can change in the fork path | ||
2686 | * - any previously selected cpu might disappear through hotplug | ||
2687 | * | ||
2688 | * We still have TASK_WAKING but PF_STARTING is gone now, meaning | ||
2689 | * ->cpus_allowed is stable, we have preemption disabled, meaning | ||
2690 | * cpu_online_mask is stable. | ||
2691 | */ | ||
2692 | cpu = select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2693 | set_task_cpu(p, cpu); | ||
2694 | #endif | ||
2633 | 2695 | ||
2634 | if (!p->sched_class->task_new || !current->se.on_rq) { | 2696 | /* |
2635 | activate_task(rq, p, 0); | 2697 | * Since the task is not on the rq and we still have TASK_WAKING set |
2636 | } else { | 2698 | * nobody else will migrate this task. |
2637 | /* | 2699 | */ |
2638 | * Let the scheduling class do new task startup | 2700 | rq = cpu_rq(cpu); |
2639 | * management (if any): | 2701 | raw_spin_lock_irqsave(&rq->lock, flags); |
2640 | */ | 2702 | |
2641 | p->sched_class->task_new(rq, p); | 2703 | BUG_ON(p->state != TASK_WAKING); |
2642 | inc_nr_running(rq); | 2704 | p->state = TASK_RUNNING; |
2643 | } | 2705 | update_rq_clock(rq); |
2706 | activate_task(rq, p, 0); | ||
2644 | trace_sched_wakeup_new(rq, p, 1); | 2707 | trace_sched_wakeup_new(rq, p, 1); |
2645 | check_preempt_curr(rq, p, WF_FORK); | 2708 | check_preempt_curr(rq, p, WF_FORK); |
2646 | #ifdef CONFIG_SMP | 2709 | #ifdef CONFIG_SMP |
2647 | if (p->sched_class->task_wake_up) | 2710 | if (p->sched_class->task_woken) |
2648 | p->sched_class->task_wake_up(rq, p); | 2711 | p->sched_class->task_woken(rq, p); |
2649 | #endif | 2712 | #endif |
2650 | task_rq_unlock(rq, &flags); | 2713 | task_rq_unlock(rq, &flags); |
2714 | put_cpu(); | ||
2651 | } | 2715 | } |
2652 | 2716 | ||
2653 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2717 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -2768,7 +2832,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2768 | finish_arch_switch(prev); | 2832 | finish_arch_switch(prev); |
2769 | litmus->finish_switch(prev); | 2833 | litmus->finish_switch(prev); |
2770 | prev->rt_param.stack_in_use = NO_CPU; | 2834 | prev->rt_param.stack_in_use = NO_CPU; |
2771 | perf_event_task_sched_in(current, cpu_of(rq)); | 2835 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2836 | local_irq_disable(); | ||
2837 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2838 | perf_event_task_sched_in(current); | ||
2839 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
2840 | local_irq_enable(); | ||
2841 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2772 | finish_lock_switch(rq, prev); | 2842 | finish_lock_switch(rq, prev); |
2773 | 2843 | ||
2774 | fire_sched_in_preempt_notifiers(current); | 2844 | fire_sched_in_preempt_notifiers(current); |
@@ -2808,10 +2878,10 @@ static inline void post_schedule(struct rq *rq) | |||
2808 | if (rq->post_schedule) { | 2878 | if (rq->post_schedule) { |
2809 | unsigned long flags; | 2879 | unsigned long flags; |
2810 | 2880 | ||
2811 | spin_lock_irqsave(&rq->lock, flags); | 2881 | raw_spin_lock_irqsave(&rq->lock, flags); |
2812 | if (rq->curr->sched_class->post_schedule) | 2882 | if (rq->curr->sched_class->post_schedule) |
2813 | rq->curr->sched_class->post_schedule(rq); | 2883 | rq->curr->sched_class->post_schedule(rq); |
2814 | spin_unlock_irqrestore(&rq->lock, flags); | 2884 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
2815 | 2885 | ||
2816 | rq->post_schedule = 0; | 2886 | rq->post_schedule = 0; |
2817 | } | 2887 | } |
@@ -2875,14 +2945,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2875 | */ | 2945 | */ |
2876 | arch_start_context_switch(prev); | 2946 | arch_start_context_switch(prev); |
2877 | 2947 | ||
2878 | if (unlikely(!mm)) { | 2948 | if (likely(!mm)) { |
2879 | next->active_mm = oldmm; | 2949 | next->active_mm = oldmm; |
2880 | atomic_inc(&oldmm->mm_count); | 2950 | atomic_inc(&oldmm->mm_count); |
2881 | enter_lazy_tlb(oldmm, next); | 2951 | enter_lazy_tlb(oldmm, next); |
2882 | } else | 2952 | } else |
2883 | switch_mm(oldmm, mm, next); | 2953 | switch_mm(oldmm, mm, next); |
2884 | 2954 | ||
2885 | if (unlikely(!prev->mm)) { | 2955 | if (likely(!prev->mm)) { |
2886 | prev->active_mm = NULL; | 2956 | prev->active_mm = NULL; |
2887 | rq->prev_mm = oldmm; | 2957 | rq->prev_mm = oldmm; |
2888 | } | 2958 | } |
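The mm handling in context_switch() shown above lets a task without an mm of its own (a kernel thread) borrow the previous task's address space as active_mm, bumping its reference count, and hands that borrowed reference back through rq->prev_mm when switching away from the borrower. A reference-count-only model of that borrow/return, with invented type names:

#include <stdio.h>

struct mm {
    int count;                 /* mm_count-style reference counter */
};

struct task {
    struct mm *mm;             /* NULL models a kernel thread      */
    struct mm *active_mm;      /* address space the cpu runs in    */
};

/* Mirror of the mm handover in context_switch(). Returns the mm whose
 * reference should be dropped after the switch (the kernel does this
 * via rq->prev_mm in finish_task_switch()), or NULL. */
static struct mm *switch_mm_refs(struct task *prev, struct task *next)
{
    struct mm *oldmm = prev->active_mm;

    if (!next->mm) {                /* borrower: keep using oldmm */
        next->active_mm = oldmm;
        oldmm->count++;
    } else {
        next->active_mm = next->mm;
    }

    if (!prev->mm) {                /* previous task was a borrower */
        prev->active_mm = NULL;
        return oldmm;               /* caller drops this reference  */
    }
    return NULL;
}

int main(void)
{
    struct mm user_mm = { .count = 1 };
    struct task user = { .mm = &user_mm, .active_mm = &user_mm };
    struct task kthread = { .mm = NULL, .active_mm = NULL };

    switch_mm_refs(&user, &kthread);            /* kthread borrows user_mm */
    struct mm *drop = switch_mm_refs(&kthread, &user);
    if (drop)
        drop->count--;                          /* deferred drop            */

    printf("user_mm.count = %d\n", user_mm.count);   /* back to 1 */
    return 0;
}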
@@ -3045,15 +3115,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
3045 | } | 3115 | } |
3046 | 3116 | ||
3047 | /* | 3117 | /* |
3048 | * Externally visible per-cpu scheduler statistics: | ||
3049 | * cpu_nr_migrations(cpu) - number of migrations into that cpu | ||
3050 | */ | ||
3051 | u64 cpu_nr_migrations(int cpu) | ||
3052 | { | ||
3053 | return cpu_rq(cpu)->nr_migrations_in; | ||
3054 | } | ||
3055 | |||
3056 | /* | ||
3057 | * Update rq->cpu_load[] statistics. This function is usually called every | 3118 | * Update rq->cpu_load[] statistics. This function is usually called every |
3058 | * scheduler tick (TICK_NSEC). | 3119 | * scheduler tick (TICK_NSEC). |
3059 | */ | 3120 | */ |
@@ -3091,65 +3152,36 @@ static void update_cpu_load(struct rq *this_rq) | |||
3091 | #ifdef CONFIG_SMP | 3152 | #ifdef CONFIG_SMP |
3092 | 3153 | ||
3093 | /* | 3154 | /* |
3094 | * double_rq_lock - safely lock two runqueues | 3155 | * sched_exec - execve() is a valuable balancing opportunity, because at |
3095 | * | 3156 | * this point the task has the smallest effective memory and cache footprint. |
3096 | * Note this does not disable interrupts like task_rq_lock, | ||
3097 | * you need to do so manually before calling. | ||
3098 | */ | ||
3099 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
3100 | __acquires(rq1->lock) | ||
3101 | __acquires(rq2->lock) | ||
3102 | { | ||
3103 | BUG_ON(!irqs_disabled()); | ||
3104 | if (rq1 == rq2) { | ||
3105 | spin_lock(&rq1->lock); | ||
3106 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
3107 | } else { | ||
3108 | if (rq1 < rq2) { | ||
3109 | spin_lock(&rq1->lock); | ||
3110 | spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
3111 | } else { | ||
3112 | spin_lock(&rq2->lock); | ||
3113 | spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
3114 | } | ||
3115 | } | ||
3116 | update_rq_clock(rq1); | ||
3117 | update_rq_clock(rq2); | ||
3118 | } | ||
3119 | |||
3120 | /* | ||
3121 | * double_rq_unlock - safely unlock two runqueues | ||
3122 | * | ||
3123 | * Note this does not restore interrupts like task_rq_unlock, | ||
3124 | * you need to do so manually after calling. | ||
3125 | */ | ||
3126 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
3127 | __releases(rq1->lock) | ||
3128 | __releases(rq2->lock) | ||
3129 | { | ||
3130 | spin_unlock(&rq1->lock); | ||
3131 | if (rq1 != rq2) | ||
3132 | spin_unlock(&rq2->lock); | ||
3133 | else | ||
3134 | __release(rq2->lock); | ||
3135 | } | ||
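The double_rq_lock()/double_rq_unlock() helpers deleted in this hunk avoid ABBA deadlock by always taking the lower-addressed runqueue lock first, so any two CPUs locking the same pair agree on the order. A minimal userspace sketch of that ordering discipline with pthread mutexes; the struct and function names are invented and are not the kernel's:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct rq {
    pthread_mutex_t lock;
    int cpu;
};

/* Lock two "runqueues" in a globally consistent order (by address,
 * as double_rq_lock() does), so two threads locking the same pair
 * in opposite argument order can never deadlock. */
static void double_lock(struct rq *a, struct rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if ((uintptr_t)a < (uintptr_t)b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void double_unlock(struct rq *a, struct rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct rq rq0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct rq rq1 = { PTHREAD_MUTEX_INITIALIZER, 1 };

    double_lock(&rq0, &rq1);    /* same lock order as double_lock(&rq1, &rq0) */
    printf("both queues locked\n");
    double_unlock(&rq0, &rq1);
    return 0;
}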
3136 | |||
3137 | /* | ||
3138 | * If dest_cpu is allowed for this process, migrate the task to it. | ||
3139 | * This is accomplished by forcing the cpu_allowed mask to only | ||
3140 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | ||
3141 | * the cpu_allowed mask is restored. | ||
3142 | */ | 3157 | */ |
3143 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) | 3158 | void sched_exec(void) |
3144 | { | 3159 | { |
3160 | struct task_struct *p = current; | ||
3145 | struct migration_req req; | 3161 | struct migration_req req; |
3162 | int dest_cpu, this_cpu; | ||
3146 | unsigned long flags; | 3163 | unsigned long flags; |
3147 | struct rq *rq; | 3164 | struct rq *rq; |
3148 | 3165 | ||
3166 | again: | ||
3167 | this_cpu = get_cpu(); | ||
3168 | dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); | ||
3169 | if (dest_cpu == this_cpu) { | ||
3170 | put_cpu(); | ||
3171 | return; | ||
3172 | } | ||
3173 | |||
3149 | rq = task_rq_lock(p, &flags); | 3174 | rq = task_rq_lock(p, &flags); |
3175 | put_cpu(); | ||
3176 | |||
3177 | /* | ||
3178 | * select_task_rq() can race against ->cpus_allowed | ||
3179 | */ | ||
3150 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) | 3180 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) |
3151 | || unlikely(!cpu_active(dest_cpu))) | 3181 | || unlikely(!cpu_active(dest_cpu))) { |
3152 | goto out; | 3182 | task_rq_unlock(rq, &flags); |
3183 | goto again; | ||
3184 | } | ||
3153 | 3185 | ||
3154 | /* force the process onto the specified CPU */ | 3186 | /* force the process onto the specified CPU */ |
3155 | if (migrate_task(p, dest_cpu, &req)) { | 3187 | if (migrate_task(p, dest_cpu, &req)) { |
@@ -3164,1784 +3196,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) | |||
3164 | 3196 | ||
3165 | return; | 3197 | return; |
3166 | } | 3198 | } |
3167 | out: | ||
3168 | task_rq_unlock(rq, &flags); | 3199 | task_rq_unlock(rq, &flags); |
3169 | } | 3200 | } |
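The reworked sched_exec() above picks a destination CPU before taking the runqueue lock and then re-checks that choice under the lock, retrying from the "again:" label if cpus_allowed changed or the CPU went offline in the meantime. A generic sketch of this optimistic select/lock/revalidate/retry pattern; all names are invented for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long allowed_mask = 0x1;    /* only "cpu 0" is allowed */

/* Unlocked guess -- may be stale by the time the lock is taken.
 * The first guess is deliberately wrong to exercise the retry. */
static int pick_cpu(void)
{
    static int calls;
    return calls++ ? 0 : 1;
}

static int choose_dest(void)
{
    int dest;

    for (;;) {
        dest = pick_cpu();                    /* optimistic selection  */

        pthread_mutex_lock(&state_lock);
        if (allowed_mask & (1UL << dest)) {   /* revalidate under lock */
            pthread_mutex_unlock(&state_lock);
            return dest;
        }
        pthread_mutex_unlock(&state_lock);
        /* raced with an affinity change: try again */
    }
}

int main(void)
{
    printf("dest cpu = %d\n", choose_dest());
    return 0;
}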
3170 | 3201 | ||
3171 | /* | ||
3172 | * sched_exec - execve() is a valuable balancing opportunity, because at | ||
3173 | * this point the task has the smallest effective memory and cache footprint. | ||
3174 | */ | ||
3175 | void sched_exec(void) | ||
3176 | { | ||
3177 | int new_cpu, this_cpu = get_cpu(); | ||
3178 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); | ||
3179 | put_cpu(); | ||
3180 | if (new_cpu != this_cpu) | ||
3181 | sched_migrate_task(current, new_cpu); | ||
3182 | } | ||
3183 | |||
3184 | /* | ||
3185 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
3186 | * Both runqueues must be locked. | ||
3187 | */ | ||
3188 | static void pull_task(struct rq *src_rq, struct task_struct *p, | ||
3189 | struct rq *this_rq, int this_cpu) | ||
3190 | { | ||
3191 | deactivate_task(src_rq, p, 0); | ||
3192 | set_task_cpu(p, this_cpu); | ||
3193 | activate_task(this_rq, p, 0); | ||
3194 | /* | ||
3195 | * Note that idle threads have a prio of MAX_PRIO, so this test | ||
3196 | * is always true for them. | ||

3197 | */ | ||
3198 | check_preempt_curr(this_rq, p, 0); | ||
3199 | } | ||
3200 | |||
3201 | /* | ||
3202 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
3203 | */ | ||
3204 | static | ||
3205 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
3206 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3207 | int *all_pinned) | ||
3208 | { | ||
3209 | int tsk_cache_hot = 0; | ||
3210 | /* | ||
3211 | * We do not migrate tasks that are: | ||
3212 | * 1) running (obviously), or | ||
3213 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
3214 | * 3) are cache-hot on their current CPU. | ||
3215 | */ | ||
3216 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
3217 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
3218 | return 0; | ||
3219 | } | ||
3220 | *all_pinned = 0; | ||
3221 | |||
3222 | if (task_running(rq, p)) { | ||
3223 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
3224 | return 0; | ||
3225 | } | ||
3226 | |||
3227 | /* | ||
3228 | * Aggressive migration if: | ||
3229 | * 1) task is cache cold, or | ||
3230 | * 2) too many balance attempts have failed. | ||
3231 | */ | ||
3232 | |||
3233 | tsk_cache_hot = task_hot(p, rq->clock, sd); | ||
3234 | if (!tsk_cache_hot || | ||
3235 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
3236 | #ifdef CONFIG_SCHEDSTATS | ||
3237 | if (tsk_cache_hot) { | ||
3238 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
3239 | schedstat_inc(p, se.nr_forced_migrations); | ||
3240 | } | ||
3241 | #endif | ||
3242 | return 1; | ||
3243 | } | ||
3244 | |||
3245 | if (tsk_cache_hot) { | ||
3246 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
3247 | return 0; | ||
3248 | } | ||
3249 | return 1; | ||
3250 | } | ||
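can_migrate_task() above rejects a task that is currently running, not allowed on the destination CPU, or still cache-hot, unless balancing has already failed more often than cache_nice_tries (aggressive migration). A standalone model of that predicate with invented field names:

#include <stdbool.h>
#include <stdio.h>

struct fake_task {
    bool running;              /* currently on a CPU            */
    unsigned long allowed;     /* bitmask of allowed CPUs       */
    bool cache_hot;            /* recently ran, cache still warm */
};

/* Mirror of the three checks: affinity, running, cache-hotness.
 * Cache-hot tasks are still taken once balancing has failed more
 * than 'nice_tries' times. */
static bool can_migrate(const struct fake_task *p, int dest_cpu,
                        int balance_failed, int nice_tries)
{
    if (!(p->allowed & (1UL << dest_cpu)))
        return false;
    if (p->running)
        return false;
    if (p->cache_hot && balance_failed <= nice_tries)
        return false;
    return true;
}

int main(void)
{
    struct fake_task t = { .running = false, .allowed = 0x3, .cache_hot = true };

    printf("%d\n", can_migrate(&t, 0, 0, 2));  /* 0: cache hot, few failures */
    printf("%d\n", can_migrate(&t, 0, 5, 2));  /* 1: aggressive migration    */
    printf("%d\n", can_migrate(&t, 3, 5, 2));  /* 0: CPU 3 not allowed       */
    return 0;
}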
3251 | |||
3252 | static unsigned long | ||
3253 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3254 | unsigned long max_load_move, struct sched_domain *sd, | ||
3255 | enum cpu_idle_type idle, int *all_pinned, | ||
3256 | int *this_best_prio, struct rq_iterator *iterator) | ||
3257 | { | ||
3258 | int loops = 0, pulled = 0, pinned = 0; | ||
3259 | struct task_struct *p; | ||
3260 | long rem_load_move = max_load_move; | ||
3261 | |||
3262 | if (max_load_move == 0) | ||
3263 | goto out; | ||
3264 | |||
3265 | pinned = 1; | ||
3266 | |||
3267 | /* | ||
3268 | * Start the load-balancing iterator: | ||
3269 | */ | ||
3270 | p = iterator->start(iterator->arg); | ||
3271 | next: | ||
3272 | if (!p || loops++ > sysctl_sched_nr_migrate) | ||
3273 | goto out; | ||
3274 | |||
3275 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
3276 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3277 | p = iterator->next(iterator->arg); | ||
3278 | goto next; | ||
3279 | } | ||
3280 | |||
3281 | pull_task(busiest, p, this_rq, this_cpu); | ||
3282 | pulled++; | ||
3283 | rem_load_move -= p->se.load.weight; | ||
3284 | |||
3285 | #ifdef CONFIG_PREEMPT | ||
3286 | /* | ||
3287 | * NEWIDLE balancing is a source of latency, so preemptible kernels | ||
3288 | * will stop after the first task is pulled to minimize the critical | ||
3289 | * section. | ||
3290 | */ | ||
3291 | if (idle == CPU_NEWLY_IDLE) | ||
3292 | goto out; | ||
3293 | #endif | ||
3294 | |||
3295 | /* | ||
3296 | * We only want to steal up to the prescribed amount of weighted load. | ||
3297 | */ | ||
3298 | if (rem_load_move > 0) { | ||
3299 | if (p->prio < *this_best_prio) | ||
3300 | *this_best_prio = p->prio; | ||
3301 | p = iterator->next(iterator->arg); | ||
3302 | goto next; | ||
3303 | } | ||
3304 | out: | ||
3305 | /* | ||
3306 | * Right now, this is one of only two places pull_task() is called, | ||
3307 | * so we can safely collect pull_task() stats here rather than | ||
3308 | * inside pull_task(). | ||
3309 | */ | ||
3310 | schedstat_add(sd, lb_gained[idle], pulled); | ||
3311 | |||
3312 | if (all_pinned) | ||
3313 | *all_pinned = pinned; | ||
3314 | |||
3315 | return max_load_move - rem_load_move; | ||
3316 | } | ||
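balance_tasks() above walks the busiest queue and greedily pulls tasks until either the iteration cap (sysctl_sched_nr_migrate) or the remaining weighted-load budget is exhausted, skipping any task heavier than twice the remaining budget. A standalone sketch of the same loop over a plain array; the names and numbers are invented:

#include <stdio.h>

/* Greedily pick tasks whose weight fits the remaining budget,
 * stopping after 'max_count' pulls -- the shape of balance_tasks(). */
static unsigned long pull_up_to(const unsigned long *weights, int n,
                                unsigned long budget, int max_count)
{
    unsigned long moved = 0;
    int pulled = 0;

    for (int i = 0; i < n && pulled < max_count; i++) {
        /* skip tasks heavier than twice the remaining budget */
        if ((weights[i] >> 1) > budget - moved)
            continue;
        moved += weights[i];
        pulled++;
        if (moved >= budget)
            break;
    }
    return moved;
}

int main(void)
{
    unsigned long w[] = { 1024, 2048, 512, 1024 };

    /* budget of 2000 load units, at most 32 migrations */
    printf("moved %lu\n", pull_up_to(w, 4, 2000, 32));
    return 0;
}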
3317 | |||
3318 | /* | ||
3319 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
3320 | * this_rq, as part of a balancing operation within domain "sd". | ||
3321 | * Returns 1 if successful and 0 otherwise. | ||
3322 | * | ||
3323 | * Called with both runqueues locked. | ||
3324 | */ | ||
3325 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3326 | unsigned long max_load_move, | ||
3327 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3328 | int *all_pinned) | ||
3329 | { | ||
3330 | const struct sched_class *class = sched_class_highest; | ||
3331 | unsigned long total_load_moved = 0; | ||
3332 | int this_best_prio = this_rq->curr->prio; | ||
3333 | |||
3334 | do { | ||
3335 | total_load_moved += | ||
3336 | class->load_balance(this_rq, this_cpu, busiest, | ||
3337 | max_load_move - total_load_moved, | ||
3338 | sd, idle, all_pinned, &this_best_prio); | ||
3339 | class = class->next; | ||
3340 | |||
3341 | #ifdef CONFIG_PREEMPT | ||
3342 | /* | ||
3343 | * NEWIDLE balancing is a source of latency, so preemptible | ||
3344 | * kernels will stop after the first task is pulled to minimize | ||
3345 | * the critical section. | ||
3346 | */ | ||
3347 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3348 | break; | ||
3349 | #endif | ||
3350 | } while (class && max_load_move > total_load_moved); | ||
3351 | |||
3352 | return total_load_moved > 0; | ||
3353 | } | ||
3354 | |||
3355 | static int | ||
3356 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3357 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3358 | struct rq_iterator *iterator) | ||
3359 | { | ||
3360 | struct task_struct *p = iterator->start(iterator->arg); | ||
3361 | int pinned = 0; | ||
3362 | |||
3363 | while (p) { | ||
3364 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3365 | pull_task(busiest, p, this_rq, this_cpu); | ||
3366 | /* | ||
3367 | * Right now, this is only the second place pull_task() | ||
3368 | * is called, so we can safely collect pull_task() | ||
3369 | * stats here rather than inside pull_task(). | ||
3370 | */ | ||
3371 | schedstat_inc(sd, lb_gained[idle]); | ||
3372 | |||
3373 | return 1; | ||
3374 | } | ||
3375 | p = iterator->next(iterator->arg); | ||
3376 | } | ||
3377 | |||
3378 | return 0; | ||
3379 | } | ||
3380 | |||
3381 | /* | ||
3382 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
3383 | * part of active balancing operations within "domain". | ||
3384 | * Returns 1 if successful and 0 otherwise. | ||
3385 | * | ||
3386 | * Called with both runqueues locked. | ||
3387 | */ | ||
3388 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3389 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
3390 | { | ||
3391 | const struct sched_class *class; | ||
3392 | |||
3393 | for_each_class(class) { | ||
3394 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | ||
3395 | return 1; | ||
3396 | } | ||
3397 | |||
3398 | return 0; | ||
3399 | } | ||
3400 | /********** Helpers for find_busiest_group ************************/ | ||
3401 | /* | ||
3402 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
3403 | * during load balancing. | ||
3404 | */ | ||
3405 | struct sd_lb_stats { | ||
3406 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
3407 | struct sched_group *this; /* Local group in this sd */ | ||
3408 | unsigned long total_load; /* Total load of all groups in sd */ | ||
3409 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
3410 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
3411 | |||
3412 | /** Statistics of this group */ | ||
3413 | unsigned long this_load; | ||
3414 | unsigned long this_load_per_task; | ||
3415 | unsigned long this_nr_running; | ||
3416 | |||
3417 | /* Statistics of the busiest group */ | ||
3418 | unsigned long max_load; | ||
3419 | unsigned long busiest_load_per_task; | ||
3420 | unsigned long busiest_nr_running; | ||
3421 | |||
3422 | int group_imb; /* Is there imbalance in this sd */ | ||
3423 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3424 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3425 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3426 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3427 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3428 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3429 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3430 | #endif | ||
3431 | }; | ||
3432 | |||
3433 | /* | ||
3434 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
3435 | */ | ||
3436 | struct sg_lb_stats { | ||
3437 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
3438 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
3439 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
3440 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
3441 | unsigned long group_capacity; | ||
3442 | int group_imb; /* Is there an imbalance in the group ? */ | ||
3443 | }; | ||
3444 | |||
3445 | /** | ||
3446 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
3447 | * @group: The group whose first cpu is to be returned. | ||
3448 | */ | ||
3449 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
3450 | { | ||
3451 | return cpumask_first(sched_group_cpus(group)); | ||
3452 | } | ||
3453 | |||
3454 | /** | ||
3455 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
3456 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
3457 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | ||
3458 | */ | ||
3459 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
3460 | enum cpu_idle_type idle) | ||
3461 | { | ||
3462 | int load_idx; | ||
3463 | |||
3464 | switch (idle) { | ||
3465 | case CPU_NOT_IDLE: | ||
3466 | load_idx = sd->busy_idx; | ||
3467 | break; | ||
3468 | |||
3469 | case CPU_NEWLY_IDLE: | ||
3470 | load_idx = sd->newidle_idx; | ||
3471 | break; | ||
3472 | default: | ||
3473 | load_idx = sd->idle_idx; | ||
3474 | break; | ||
3475 | } | ||
3476 | |||
3477 | return load_idx; | ||
3478 | } | ||
3479 | |||
3480 | |||
3481 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3482 | /** | ||
3483 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3484 | * the given sched_domain, during load balancing. | ||
3485 | * | ||
3486 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3487 | * @sds: Variable containing the statistics for sd. | ||
3488 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3489 | */ | ||
3490 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3491 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3492 | { | ||
3493 | /* | ||
3494 | * Busy processors will not participate in power savings | ||
3495 | * balance. | ||
3496 | */ | ||
3497 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3498 | sds->power_savings_balance = 0; | ||
3499 | else { | ||
3500 | sds->power_savings_balance = 1; | ||
3501 | sds->min_nr_running = ULONG_MAX; | ||
3502 | sds->leader_nr_running = 0; | ||
3503 | } | ||
3504 | } | ||
3505 | |||
3506 | /** | ||
3507 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3508 | * sched_domain while performing load balancing. | ||
3509 | * | ||
3510 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3511 | * @sds: Variable containing the statistics of the sched_domain | ||
3512 | * @local_group: Does group contain the CPU for which we're performing | ||
3513 | * load balancing ? | ||
3514 | * @sgs: Variable containing the statistics of the group. | ||
3515 | */ | ||
3516 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3517 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3518 | { | ||
3519 | |||
3520 | if (!sds->power_savings_balance) | ||
3521 | return; | ||
3522 | |||
3523 | /* | ||
3524 | * If the local group is idle or completely loaded | ||
3525 | * no need to do power savings balance at this domain | ||
3526 | */ | ||
3527 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3528 | !sds->this_nr_running)) | ||
3529 | sds->power_savings_balance = 0; | ||
3530 | |||
3531 | /* | ||
3532 | * If a group is already running at full capacity or idle, | ||
3533 | * don't include that group in power savings calculations | ||
3534 | */ | ||
3535 | if (!sds->power_savings_balance || | ||
3536 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3537 | !sgs->sum_nr_running) | ||
3538 | return; | ||
3539 | |||
3540 | /* | ||
3541 | * Calculate the group which has the least non-idle load. | ||
3542 | * This is the group from where we need to pick up the load | ||
3543 | * for saving power | ||
3544 | */ | ||
3545 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3546 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3547 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3548 | sds->group_min = group; | ||
3549 | sds->min_nr_running = sgs->sum_nr_running; | ||
3550 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3551 | sgs->sum_nr_running; | ||
3552 | } | ||
3553 | |||
3554 | /* | ||
3555 | * Calculate the group which is almost near its | ||
3556 | * capacity but still has some space to pick up some load | ||
3557 | * from other group and save more power | ||
3558 | */ | ||
3559 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3560 | return; | ||
3561 | |||
3562 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3563 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3564 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3565 | sds->group_leader = group; | ||
3566 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3567 | } | ||
3568 | } | ||
3569 | |||
3570 | /** | ||
3571 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3572 | * @sds: Variable containing the statistics of the sched_domain | ||
3573 | * under consideration. | ||
3574 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3575 | * @imbalance: Variable to store the imbalance. | ||
3576 | * | ||
3577 | * Description: | ||
3578 | * Check if we have potential to perform some power-savings balance. | ||
3579 | * If yes, set the busiest group to be the least loaded group in the | ||
3580 | * sched_domain, so that its CPUs can be put to idle. | ||
3581 | * | ||
3582 | * Returns 1 if there is potential to perform power-savings balance. | ||
3583 | * Else returns 0. | ||
3584 | */ | ||
3585 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3586 | int this_cpu, unsigned long *imbalance) | ||
3587 | { | ||
3588 | if (!sds->power_savings_balance) | ||
3589 | return 0; | ||
3590 | |||
3591 | if (sds->this != sds->group_leader || | ||
3592 | sds->group_leader == sds->group_min) | ||
3593 | return 0; | ||
3594 | |||
3595 | *imbalance = sds->min_load_per_task; | ||
3596 | sds->busiest = sds->group_min; | ||
3597 | |||
3598 | return 1; | ||
3599 | |||
3600 | } | ||
3601 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3602 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3603 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3604 | { | ||
3605 | return; | ||
3606 | } | ||
3607 | |||
3608 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3609 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3610 | { | ||
3611 | return; | ||
3612 | } | ||
3613 | |||
3614 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3615 | int this_cpu, unsigned long *imbalance) | ||
3616 | { | ||
3617 | return 0; | ||
3618 | } | ||
3619 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3620 | |||
3621 | |||
3622 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3623 | { | ||
3624 | return SCHED_LOAD_SCALE; | ||
3625 | } | ||
3626 | |||
3627 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3628 | { | ||
3629 | return default_scale_freq_power(sd, cpu); | ||
3630 | } | ||
3631 | |||
3632 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3633 | { | ||
3634 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3635 | unsigned long smt_gain = sd->smt_gain; | ||
3636 | |||
3637 | smt_gain /= weight; | ||
3638 | |||
3639 | return smt_gain; | ||
3640 | } | ||
3641 | |||
3642 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3643 | { | ||
3644 | return default_scale_smt_power(sd, cpu); | ||
3645 | } | ||
3646 | |||
3647 | unsigned long scale_rt_power(int cpu) | ||
3648 | { | ||
3649 | struct rq *rq = cpu_rq(cpu); | ||
3650 | u64 total, available; | ||
3651 | |||
3652 | sched_avg_update(rq); | ||
3653 | |||
3654 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3655 | available = total - rq->rt_avg; | ||
3656 | |||
3657 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3658 | total = SCHED_LOAD_SCALE; | ||
3659 | |||
3660 | total >>= SCHED_LOAD_SHIFT; | ||
3661 | |||
3662 | return div_u64(available, total); | ||
3663 | } | ||
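scale_rt_power() above returns, in fixed point (0..SCHED_LOAD_SCALE), the share of the averaging window that was not consumed by real-time activity: available = total - rt_avg, divided by total pre-shifted down by SCHED_LOAD_SHIFT. A small arithmetic sketch with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define LOAD_SCALE 1024UL
#define LOAD_SHIFT 10

/* Fraction (scaled to 0..1024) of the window NOT consumed by RT tasks,
 * mirroring the arithmetic in scale_rt_power(). */
static uint64_t rt_free_fraction(uint64_t window_ns, uint64_t rt_ns)
{
    uint64_t total = window_ns;
    uint64_t available = total > rt_ns ? total - rt_ns : 0;

    if (total < LOAD_SCALE)
        total = LOAD_SCALE;
    total >>= LOAD_SHIFT;           /* convert the divisor to fixed point */

    return available / total;       /* ~1024 * available / window */
}

int main(void)
{
    /* 100ms window, 25ms spent in RT tasks -> 768, i.e. ~3/4 of 1024 */
    printf("%llu\n",
           (unsigned long long)rt_free_fraction(100000000ULL, 25000000ULL));
    return 0;
}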
3664 | |||
3665 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3666 | { | ||
3667 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3668 | unsigned long power = SCHED_LOAD_SCALE; | ||
3669 | struct sched_group *sdg = sd->groups; | ||
3670 | |||
3671 | if (sched_feat(ARCH_POWER)) | ||
3672 | power *= arch_scale_freq_power(sd, cpu); | ||
3673 | else | ||
3674 | power *= default_scale_freq_power(sd, cpu); | ||
3675 | |||
3676 | power >>= SCHED_LOAD_SHIFT; | ||
3677 | |||
3678 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3679 | if (sched_feat(ARCH_POWER)) | ||
3680 | power *= arch_scale_smt_power(sd, cpu); | ||
3681 | else | ||
3682 | power *= default_scale_smt_power(sd, cpu); | ||
3683 | |||
3684 | power >>= SCHED_LOAD_SHIFT; | ||
3685 | } | ||
3686 | |||
3687 | power *= scale_rt_power(cpu); | ||
3688 | power >>= SCHED_LOAD_SHIFT; | ||
3689 | |||
3690 | if (!power) | ||
3691 | power = 1; | ||
3692 | |||
3693 | sdg->cpu_power = power; | ||
3694 | } | ||
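update_cpu_power() above composes its scaling factors in fixed point: each factor is a value near SCHED_LOAD_SCALE (1024 == 1.0) that is multiplied in and then shifted back down by SCHED_LOAD_SHIFT. A short sketch of that multiply-then-shift chaining; the factor values below are purely illustrative:

#include <stdio.h>

#define LOAD_SCALE 1024UL
#define LOAD_SHIFT 10

/* Apply one fixed-point factor (1024 == 1.0) to a power value. */
static unsigned long scale(unsigned long power, unsigned long factor)
{
    return (power * factor) >> LOAD_SHIFT;
}

int main(void)
{
    unsigned long power = LOAD_SCALE;    /* start at nominal 1.0        */

    power = scale(power, 1024);          /* frequency factor: 1.0       */
    power = scale(power, 589);           /* SMT factor for 2 siblings   */
    power = scale(power, 768);           /* RT headroom: ~75% free      */

    if (!power)                          /* never let power reach 0     */
        power = 1;

    printf("effective cpu_power = %lu\n", power);   /* 441 with these factors */
    return 0;
}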
3695 | |||
3696 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3697 | { | ||
3698 | struct sched_domain *child = sd->child; | ||
3699 | struct sched_group *group, *sdg = sd->groups; | ||
3700 | unsigned long power; | ||
3701 | |||
3702 | if (!child) { | ||
3703 | update_cpu_power(sd, cpu); | ||
3704 | return; | ||
3705 | } | ||
3706 | |||
3707 | power = 0; | ||
3708 | |||
3709 | group = child->groups; | ||
3710 | do { | ||
3711 | power += group->cpu_power; | ||
3712 | group = group->next; | ||
3713 | } while (group != child->groups); | ||
3714 | |||
3715 | sdg->cpu_power = power; | ||
3716 | } | ||
3717 | |||
3718 | /** | ||
3719 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
3720 | * @sd: The sched_domain whose statistics are to be updated. | ||
3721 | * @group: sched_group whose statistics are to be updated. | ||
3722 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3723 | * @idle: Idle status of this_cpu | ||
3724 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
3725 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3726 | * @local_group: Does group contain this_cpu. | ||
3727 | * @cpus: Set of cpus considered for load balancing. | ||
3728 | * @balance: Should we balance. | ||
3729 | * @sgs: variable to hold the statistics for this group. | ||
3730 | */ | ||
3731 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
3732 | struct sched_group *group, int this_cpu, | ||
3733 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
3734 | int local_group, const struct cpumask *cpus, | ||
3735 | int *balance, struct sg_lb_stats *sgs) | ||
3736 | { | ||
3737 | unsigned long load, max_cpu_load, min_cpu_load; | ||
3738 | int i; | ||
3739 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
3740 | unsigned long sum_avg_load_per_task; | ||
3741 | unsigned long avg_load_per_task; | ||
3742 | |||
3743 | if (local_group) { | ||
3744 | balance_cpu = group_first_cpu(group); | ||
3745 | if (balance_cpu == this_cpu) | ||
3746 | update_group_power(sd, this_cpu); | ||
3747 | } | ||
3748 | |||
3749 | /* Tally up the load of all CPUs in the group */ | ||
3750 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3751 | max_cpu_load = 0; | ||
3752 | min_cpu_load = ~0UL; | ||
3753 | |||
3754 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
3755 | struct rq *rq = cpu_rq(i); | ||
3756 | |||
3757 | if (*sd_idle && rq->nr_running) | ||
3758 | *sd_idle = 0; | ||
3759 | |||
3760 | /* Bias balancing toward cpus of our domain */ | ||
3761 | if (local_group) { | ||
3762 | if (idle_cpu(i) && !first_idle_cpu) { | ||
3763 | first_idle_cpu = 1; | ||
3764 | balance_cpu = i; | ||
3765 | } | ||
3766 | |||
3767 | load = target_load(i, load_idx); | ||
3768 | } else { | ||
3769 | load = source_load(i, load_idx); | ||
3770 | if (load > max_cpu_load) | ||
3771 | max_cpu_load = load; | ||
3772 | if (min_cpu_load > load) | ||
3773 | min_cpu_load = load; | ||
3774 | } | ||
3775 | |||
3776 | sgs->group_load += load; | ||
3777 | sgs->sum_nr_running += rq->nr_running; | ||
3778 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
3779 | |||
3780 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
3781 | } | ||
3782 | |||
3783 | /* | ||
3784 | * First idle cpu or the first cpu(busiest) in this sched group | ||
3785 | * is eligible for doing load balancing at this and above | ||
3786 | * domains. In the newly idle case, we will allow all the cpu's | ||
3787 | * to do the newly idle load balance. | ||
3788 | */ | ||
3789 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
3790 | balance_cpu != this_cpu && balance) { | ||
3791 | *balance = 0; | ||
3792 | return; | ||
3793 | } | ||
3794 | |||
3795 | /* Adjust by relative CPU power of the group */ | ||
3796 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
3797 | |||
3798 | |||
3799 | /* | ||
3800 | * Consider the group unbalanced when the imbalance is larger | ||
3801 | * than the average weight of two tasks. | ||
3802 | * | ||
3803 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3804 | * might not be a suitable number - should we keep a | ||
3805 | * normalized nr_running number somewhere that negates | ||
3806 | * the hierarchy? | ||
3807 | */ | ||
3808 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / | ||
3809 | group->cpu_power; | ||
3810 | |||
3811 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3812 | sgs->group_imb = 1; | ||
3813 | |||
3814 | sgs->group_capacity = | ||
3815 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
3816 | } | ||
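update_sg_lb_stats() above normalizes the group's summed load by its cpu_power to get avg_load, and flags the group imbalanced when the spread between its most and least loaded CPU exceeds twice the average task weight. A simplified numeric sketch of those two computations; all values are invented:

#include <stdio.h>

#define LOAD_SCALE 1024UL

int main(void)
{
    unsigned long cpu_load[] = { 3072, 1024, 0, 0 };    /* per-cpu loads    */
    unsigned long cpu_power = 4096;                     /* 4 cpus x 1024    */
    unsigned long avg_task_load = 1024;                 /* one nice-0 task  */

    unsigned long group_load = 0, max = 0, min = ~0UL;
    for (int i = 0; i < 4; i++) {
        group_load += cpu_load[i];
        if (cpu_load[i] > max) max = cpu_load[i];
        if (cpu_load[i] < min) min = cpu_load[i];
    }

    /* Normalize by group power, as update_sg_lb_stats() does. */
    unsigned long avg_load = group_load * LOAD_SCALE / cpu_power;
    int group_imb = (max - min) > 2 * avg_task_load;

    printf("avg_load=%lu group_imb=%d\n", avg_load, group_imb);
    return 0;
}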
3817 | |||
3818 | /** | ||
3819 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
3820 | * @sd: sched_domain whose statistics are to be updated. | ||
3821 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3822 | * @idle: Idle status of this_cpu | ||
3823 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3824 | * @cpus: Set of cpus considered for load balancing. | ||
3825 | * @balance: Should we balance. | ||
3826 | * @sds: variable to hold the statistics for this sched_domain. | ||
3827 | */ | ||
3828 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
3829 | enum cpu_idle_type idle, int *sd_idle, | ||
3830 | const struct cpumask *cpus, int *balance, | ||
3831 | struct sd_lb_stats *sds) | ||
3832 | { | ||
3833 | struct sched_domain *child = sd->child; | ||
3834 | struct sched_group *group = sd->groups; | ||
3835 | struct sg_lb_stats sgs; | ||
3836 | int load_idx, prefer_sibling = 0; | ||
3837 | |||
3838 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3839 | prefer_sibling = 1; | ||
3840 | |||
3841 | init_sd_power_savings_stats(sd, sds, idle); | ||
3842 | load_idx = get_sd_load_idx(sd, idle); | ||
3843 | |||
3844 | do { | ||
3845 | int local_group; | ||
3846 | |||
3847 | local_group = cpumask_test_cpu(this_cpu, | ||
3848 | sched_group_cpus(group)); | ||
3849 | memset(&sgs, 0, sizeof(sgs)); | ||
3850 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
3851 | local_group, cpus, balance, &sgs); | ||
3852 | |||
3853 | if (local_group && balance && !(*balance)) | ||
3854 | return; | ||
3855 | |||
3856 | sds->total_load += sgs.group_load; | ||
3857 | sds->total_pwr += group->cpu_power; | ||
3858 | |||
3859 | /* | ||
3860 | * In case the child domain prefers tasks go to siblings | ||
3861 | * first, lower the group capacity to one so that we'll try | ||
3862 | * and move all the excess tasks away. | ||
3863 | */ | ||
3864 | if (prefer_sibling) | ||
3865 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3866 | |||
3867 | if (local_group) { | ||
3868 | sds->this_load = sgs.avg_load; | ||
3869 | sds->this = group; | ||
3870 | sds->this_nr_running = sgs.sum_nr_running; | ||
3871 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
3872 | } else if (sgs.avg_load > sds->max_load && | ||
3873 | (sgs.sum_nr_running > sgs.group_capacity || | ||
3874 | sgs.group_imb)) { | ||
3875 | sds->max_load = sgs.avg_load; | ||
3876 | sds->busiest = group; | ||
3877 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
3878 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
3879 | sds->group_imb = sgs.group_imb; | ||
3880 | } | ||
3881 | |||
3882 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
3883 | group = group->next; | ||
3884 | } while (group != sd->groups); | ||
3885 | } | ||
3886 | |||
3887 | /** | ||
3888 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
3889 | * amongst the groups of a sched_domain, during | ||
3890 | * load balancing. | ||
3891 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
3892 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3893 | * @imbalance: Variable to store the imbalance. | ||
3894 | */ | ||
3895 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
3896 | int this_cpu, unsigned long *imbalance) | ||
3897 | { | ||
3898 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
3899 | unsigned int imbn = 2; | ||
3900 | |||
3901 | if (sds->this_nr_running) { | ||
3902 | sds->this_load_per_task /= sds->this_nr_running; | ||
3903 | if (sds->busiest_load_per_task > | ||
3904 | sds->this_load_per_task) | ||
3905 | imbn = 1; | ||
3906 | } else | ||
3907 | sds->this_load_per_task = | ||
3908 | cpu_avg_load_per_task(this_cpu); | ||
3909 | |||
3910 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= | ||
3911 | sds->busiest_load_per_task * imbn) { | ||
3912 | *imbalance = sds->busiest_load_per_task; | ||
3913 | return; | ||
3914 | } | ||
3915 | |||
3916 | /* | ||
3917 | * OK, we don't have enough imbalance to justify moving tasks, | ||
3918 | * however we may be able to increase total CPU power used by | ||
3919 | * moving them. | ||
3920 | */ | ||
3921 | |||
3922 | pwr_now += sds->busiest->cpu_power * | ||
3923 | min(sds->busiest_load_per_task, sds->max_load); | ||
3924 | pwr_now += sds->this->cpu_power * | ||
3925 | min(sds->this_load_per_task, sds->this_load); | ||
3926 | pwr_now /= SCHED_LOAD_SCALE; | ||
3927 | |||
3928 | /* Amount of load we'd subtract */ | ||
3929 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3930 | sds->busiest->cpu_power; | ||
3931 | if (sds->max_load > tmp) | ||
3932 | pwr_move += sds->busiest->cpu_power * | ||
3933 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
3934 | |||
3935 | /* Amount of load we'd add */ | ||
3936 | if (sds->max_load * sds->busiest->cpu_power < | ||
3937 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
3938 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
3939 | sds->this->cpu_power; | ||
3940 | else | ||
3941 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3942 | sds->this->cpu_power; | ||
3943 | pwr_move += sds->this->cpu_power * | ||
3944 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
3945 | pwr_move /= SCHED_LOAD_SCALE; | ||
3946 | |||
3947 | /* Move if we gain throughput */ | ||
3948 | if (pwr_move > pwr_now) | ||
3949 | *imbalance = sds->busiest_load_per_task; | ||
3950 | } | ||
3951 | |||
3952 | /** | ||
3953 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
3954 | * groups of a given sched_domain during load balance. | ||
3955 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
3956 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
3957 | * @imbalance: The variable to store the imbalance. | ||
3958 | */ | ||
3959 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
3960 | unsigned long *imbalance) | ||
3961 | { | ||
3962 | unsigned long max_pull; | ||
3963 | /* | ||
3964 | * In the presence of smp nice balancing, certain scenarios can have | ||
3965 | * max load less than avg load (as we skip the groups at or below | ||
3966 | * its cpu_power, while calculating max_load). | ||
3967 | */ | ||
3968 | if (sds->max_load < sds->avg_load) { | ||
3969 | *imbalance = 0; | ||
3970 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
3971 | } | ||
3972 | |||
3973 | /* Don't want to pull so many tasks that a group would go idle */ | ||
3974 | max_pull = min(sds->max_load - sds->avg_load, | ||
3975 | sds->max_load - sds->busiest_load_per_task); | ||
3976 | |||
3977 | /* How much load to actually move to equalise the imbalance */ | ||
3978 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
3979 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
3980 | / SCHED_LOAD_SCALE; | ||
3981 | |||
3982 | /* | ||
3983 | * if *imbalance is less than the average load per runnable task | ||
3984 | * there is no guarantee that any tasks will be moved so we'll have | ||
3985 | * a think about bumping its value to force at least one task to be | ||
3986 | * moved | ||
3987 | */ | ||
3988 | if (*imbalance < sds->busiest_load_per_task) | ||
3989 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
3990 | |||
3991 | } | ||
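calculate_imbalance() above caps the load to move by both how far the busiest group sits above the domain average and how far the local group sits below it, converting normalized load back to absolute load through each group's cpu_power. A standalone numeric sketch of that computation; the load values are invented:

#include <stdio.h>

#define LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* normalized (per-1024-power) loads */
    unsigned long max_load  = 2048;     /* busiest group        */
    unsigned long this_load = 512;      /* local group          */
    unsigned long avg_load  = 1280;     /* domain-wide average  */
    unsigned long busiest_power = 1024, this_power = 2048;
    unsigned long busiest_load_per_task = 1024;

    /* don't pull so much that the busiest group drops below one task */
    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - busiest_load_per_task);

    unsigned long imbalance =
        min_ul(max_pull * busiest_power,
               (avg_load - this_load) * this_power) / LOAD_SCALE;

    printf("imbalance = %lu\n", imbalance);     /* 768 with these inputs */
    return 0;
}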
3992 | /******* find_busiest_group() helpers end here *********************/ | ||
3993 | |||
3994 | /** | ||
3995 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
3996 | * if there is an imbalance. If there isn't an imbalance, and | ||
3997 | * the user has opted for power-savings, it returns a group whose | ||
3998 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
3999 | * such a group exists. | ||
4000 | * | ||
4001 | * Also calculates the amount of weighted load which should be moved | ||
4002 | * to restore balance. | ||
4003 | * | ||
4004 | * @sd: The sched_domain whose busiest group is to be returned. | ||
4005 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4006 | * @imbalance: Variable which stores amount of weighted load which should | ||
4007 | * be moved to restore balance/put a group to idle. | ||
4008 | * @idle: The idle status of this_cpu. | ||
4009 | * @sd_idle: The idleness of sd | ||
4010 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
4011 | * @balance: Pointer to a variable indicating if this_cpu | ||
4012 | * is the appropriate cpu to perform load balancing at this_level. | ||
4013 | * | ||
4014 | * Returns: - the busiest group if imbalance exists. | ||
4015 | * - If no imbalance and user has opted for power-savings balance, | ||
4016 | * return the least loaded group whose CPUs can be | ||
4017 | * put to idle by rebalancing its tasks onto our group. | ||
4018 | */ | ||
4019 | static struct sched_group * | ||
4020 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
4021 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4022 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
4023 | { | ||
4024 | struct sd_lb_stats sds; | ||
4025 | |||
4026 | memset(&sds, 0, sizeof(sds)); | ||
4027 | |||
4028 | /* | ||
4029 | * Compute the various statistics relevant for load balancing at | ||
4030 | * this level. | ||
4031 | */ | ||
4032 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
4033 | balance, &sds); | ||
4034 | |||
4035 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
4036 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
4037 | * at this level. | ||
4038 | * 2) There is no busy sibling group to pull from. | ||
4039 | * 3) This group is the busiest group. | ||
4040 | * 4) This group is busier than the average busyness at this | ||
4041 | * sched_domain. | ||
4042 | * 5) The imbalance is within the specified limit. | ||
4043 | * 6) Any rebalance would lead to ping-pong | ||
4044 | */ | ||
4045 | if (balance && !(*balance)) | ||
4046 | goto ret; | ||
4047 | |||
4048 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
4049 | goto out_balanced; | ||
4050 | |||
4051 | if (sds.this_load >= sds.max_load) | ||
4052 | goto out_balanced; | ||
4053 | |||
4054 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
4055 | |||
4056 | if (sds.this_load >= sds.avg_load) | ||
4057 | goto out_balanced; | ||
4058 | |||
4059 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
4060 | goto out_balanced; | ||
4061 | |||
4062 | sds.busiest_load_per_task /= sds.busiest_nr_running; | ||
4063 | if (sds.group_imb) | ||
4064 | sds.busiest_load_per_task = | ||
4065 | min(sds.busiest_load_per_task, sds.avg_load); | ||
4066 | |||
4067 | /* | ||
4068 | * We're trying to get all the cpus to the average_load, so we don't | ||
4069 | * want to push ourselves above the average load, nor do we wish to | ||
4070 | * reduce the max loaded cpu below the average load, as either of these | ||
4071 | * actions would just result in more rebalancing later, and ping-pong | ||
4072 | * tasks around. Thus we look for the minimum possible imbalance. | ||
4073 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
4074 | * be counted as no imbalance for these purposes -- we can't fix that | ||
4075 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
4076 | * appear as very large values with unsigned longs. | ||
4077 | */ | ||
4078 | if (sds.max_load <= sds.busiest_load_per_task) | ||
4079 | goto out_balanced; | ||
4080 | |||
4081 | /* Looks like there is an imbalance. Compute it */ | ||
4082 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
4083 | return sds.busiest; | ||
4084 | |||
4085 | out_balanced: | ||
4086 | /* | ||
4087 | * There is no obvious imbalance. But check if we can do some balancing | ||
4088 | * to save power. | ||
4089 | */ | ||
4090 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4091 | return sds.busiest; | ||
4092 | ret: | ||
4093 | *imbalance = 0; | ||
4094 | return NULL; | ||
4095 | } | ||
4096 | |||
4097 | /* | ||
4098 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
4099 | */ | ||
4100 | static struct rq * | ||
4101 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
4102 | unsigned long imbalance, const struct cpumask *cpus) | ||
4103 | { | ||
4104 | struct rq *busiest = NULL, *rq; | ||
4105 | unsigned long max_load = 0; | ||
4106 | int i; | ||
4107 | |||
4108 | for_each_cpu(i, sched_group_cpus(group)) { | ||
4109 | unsigned long power = power_of(i); | ||
4110 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
4111 | unsigned long wl; | ||
4112 | |||
4113 | if (!cpumask_test_cpu(i, cpus)) | ||
4114 | continue; | ||
4115 | |||
4116 | rq = cpu_rq(i); | ||
4117 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; | ||
4118 | wl /= power; | ||
4119 | |||
4120 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
4121 | continue; | ||
4122 | |||
4123 | if (wl > max_load) { | ||
4124 | max_load = wl; | ||
4125 | busiest = rq; | ||
4126 | } | ||
4127 | } | ||
4128 | |||
4129 | return busiest; | ||
4130 | } | ||
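find_busiest_queue() above compares runqueues by their weighted load normalized to nominal power (wl * SCHED_LOAD_SCALE / power) and skips a queue whose single task already exceeds the requested imbalance, since pulling it would only overshoot. A simplified sketch of that selection (the capacity check is omitted; field names are invented):

#include <stdio.h>

#define LOAD_SCALE 1024UL

struct fake_rq {
    unsigned long load;        /* weighted_cpuload()     */
    unsigned long power;       /* cpu_power of this cpu  */
    unsigned int  nr_running;
};

/* Return the index of the busiest queue, or -1 if none qualifies. */
static int busiest_queue(const struct fake_rq *rq, int n, unsigned long imbalance)
{
    unsigned long max_wl = 0;
    int busiest = -1;

    for (int i = 0; i < n; i++) {
        /* normalize load so faster cpus look proportionally less loaded */
        unsigned long wl = rq[i].load * LOAD_SCALE / rq[i].power;

        /* a lone task bigger than the imbalance is not worth pulling */
        if (rq[i].nr_running == 1 && wl > imbalance)
            continue;

        if (wl > max_wl) {
            max_wl = wl;
            busiest = i;
        }
    }
    return busiest;
}

int main(void)
{
    struct fake_rq rqs[] = {
        { .load = 2048, .power = 1024, .nr_running = 2 },
        { .load = 3072, .power = 1024, .nr_running = 1 },
    };

    printf("busiest = %d\n", busiest_queue(rqs, 2, 1024));  /* picks rq 0 */
    return 0;
}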
4131 | |||
4132 | /* | ||
4133 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | ||
4134 | * so long as it is large enough. | ||
4135 | */ | ||
4136 | #define MAX_PINNED_INTERVAL 512 | ||
4137 | |||
4138 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
4139 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
4140 | |||
4141 | /* | ||
4142 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4143 | * tasks if there is an imbalance. | ||
4144 | */ | ||
4145 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
4146 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
4147 | int *balance) | ||
4148 | { | ||
4149 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
4150 | struct sched_group *group; | ||
4151 | unsigned long imbalance; | ||
4152 | struct rq *busiest; | ||
4153 | unsigned long flags; | ||
4154 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4155 | |||
4156 | cpumask_setall(cpus); | ||
4157 | |||
4158 | /* | ||
4159 | * When power savings policy is enabled for the parent domain, idle | ||
4160 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4161 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
4162 | * portraying it as CPU_NOT_IDLE. | ||
4163 | */ | ||
4164 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
4165 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4166 | sd_idle = 1; | ||
4167 | |||
4168 | schedstat_inc(sd, lb_count[idle]); | ||
4169 | |||
4170 | redo: | ||
4171 | update_shares(sd); | ||
4172 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
4173 | cpus, balance); | ||
4174 | |||
4175 | if (*balance == 0) | ||
4176 | goto out_balanced; | ||
4177 | |||
4178 | if (!group) { | ||
4179 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
4180 | goto out_balanced; | ||
4181 | } | ||
4182 | |||
4183 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
4184 | if (!busiest) { | ||
4185 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
4186 | goto out_balanced; | ||
4187 | } | ||
4188 | |||
4189 | BUG_ON(busiest == this_rq); | ||
4190 | |||
4191 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
4192 | |||
4193 | ld_moved = 0; | ||
4194 | if (busiest->nr_running > 1) { | ||
4195 | /* | ||
4196 | * Attempt to move tasks. If find_busiest_group has found | ||
4197 | * an imbalance but busiest->nr_running <= 1, the group is | ||
4198 | * still unbalanced. ld_moved simply stays zero, so it is | ||
4199 | * correctly treated as an imbalance. | ||
4200 | */ | ||
4201 | local_irq_save(flags); | ||
4202 | double_rq_lock(this_rq, busiest); | ||
4203 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4204 | imbalance, sd, idle, &all_pinned); | ||
4205 | double_rq_unlock(this_rq, busiest); | ||
4206 | local_irq_restore(flags); | ||
4207 | |||
4208 | /* | ||
4209 | * some other cpu did the load balance for us. | ||
4210 | */ | ||
4211 | if (ld_moved && this_cpu != smp_processor_id()) | ||
4212 | resched_cpu(this_cpu); | ||
4213 | |||
4214 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
4215 | if (unlikely(all_pinned)) { | ||
4216 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
4217 | if (!cpumask_empty(cpus)) | ||
4218 | goto redo; | ||
4219 | goto out_balanced; | ||
4220 | } | ||
4221 | } | ||
4222 | |||
4223 | if (!ld_moved) { | ||
4224 | schedstat_inc(sd, lb_failed[idle]); | ||
4225 | sd->nr_balance_failed++; | ||
4226 | |||
4227 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | ||
4228 | |||
4229 | spin_lock_irqsave(&busiest->lock, flags); | ||
4230 | |||
4231 | /* don't kick the migration_thread, if the curr | ||
4232 | * task on busiest cpu can't be moved to this_cpu | ||
4233 | */ | ||
4234 | if (!cpumask_test_cpu(this_cpu, | ||
4235 | &busiest->curr->cpus_allowed)) { | ||
4236 | spin_unlock_irqrestore(&busiest->lock, flags); | ||
4237 | all_pinned = 1; | ||
4238 | goto out_one_pinned; | ||
4239 | } | ||
4240 | |||
4241 | if (!busiest->active_balance) { | ||
4242 | busiest->active_balance = 1; | ||
4243 | busiest->push_cpu = this_cpu; | ||
4244 | active_balance = 1; | ||
4245 | } | ||
4246 | spin_unlock_irqrestore(&busiest->lock, flags); | ||
4247 | if (active_balance) | ||
4248 | wake_up_process(busiest->migration_thread); | ||
4249 | |||
4250 | /* | ||
4251 | * We've kicked active balancing, reset the failure | ||
4252 | * counter. | ||
4253 | */ | ||
4254 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
4255 | } | ||
4256 | } else | ||
4257 | sd->nr_balance_failed = 0; | ||
4258 | |||
4259 | if (likely(!active_balance)) { | ||
4260 | /* We were unbalanced, so reset the balancing interval */ | ||
4261 | sd->balance_interval = sd->min_interval; | ||
4262 | } else { | ||
4263 | /* | ||
4264 | * If we've begun active balancing, start to back off. This | ||
4265 | * case may not be covered by the all_pinned logic if there | ||
4266 | * is only 1 task on the busy runqueue (because we don't call | ||
4267 | * move_tasks). | ||
4268 | */ | ||
4269 | if (sd->balance_interval < sd->max_interval) | ||
4270 | sd->balance_interval *= 2; | ||
4271 | } | ||
4272 | |||
4273 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4274 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4275 | ld_moved = -1; | ||
4276 | |||
4277 | goto out; | ||
4278 | |||
4279 | out_balanced: | ||
4280 | schedstat_inc(sd, lb_balanced[idle]); | ||
4281 | |||
4282 | sd->nr_balance_failed = 0; | ||
4283 | |||
4284 | out_one_pinned: | ||
4285 | /* tune up the balancing interval */ | ||
4286 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
4287 | (sd->balance_interval < sd->max_interval)) | ||
4288 | sd->balance_interval *= 2; | ||
4289 | |||
4290 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4291 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4292 | ld_moved = -1; | ||
4293 | else | ||
4294 | ld_moved = 0; | ||
4295 | out: | ||
4296 | if (ld_moved) | ||
4297 | update_shares(sd); | ||
4298 | return ld_moved; | ||
4299 | } | ||
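When balancing fails, load_balance() above backs off by doubling balance_interval (bounded by max_interval, and by MAX_PINNED_INTERVAL in the all-pinned case) and resets it back toward min_interval once balancing makes progress. A rough sketch of that backoff policy, simplified relative to the code above:

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

/* Double the interval on failure, clamp it, reset on success --
 * the shape of the backoff at the end of load_balance(). */
static unsigned long next_interval(unsigned long cur, unsigned long min,
                                   unsigned long max, int moved, int all_pinned)
{
    if (moved)
        return min;                  /* we balanced, retry soon */

    if ((all_pinned && cur < MAX_PINNED_INTERVAL) || cur < max)
        cur *= 2;

    return cur;
}

int main(void)
{
    unsigned long interval = 8;      /* ms, made-up starting point */

    for (int i = 0; i < 6; i++) {
        interval = next_interval(interval, 8, 128, 0, 0);
        printf("interval now %lu ms\n", interval);
    }
    printf("after success: %lu ms\n", next_interval(interval, 8, 128, 1, 0));
    return 0;
}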
4300 | |||
4301 | /* | ||
4302 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4303 | * tasks if there is an imbalance. | ||
4304 | * | ||
4305 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | ||
4306 | * this_rq is locked. | ||
4307 | */ | ||
4308 | static int | ||
4309 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | ||
4310 | { | ||
4311 | struct sched_group *group; | ||
4312 | struct rq *busiest = NULL; | ||
4313 | unsigned long imbalance; | ||
4314 | int ld_moved = 0; | ||
4315 | int sd_idle = 0; | ||
4316 | int all_pinned = 0; | ||
4317 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4318 | |||
4319 | cpumask_setall(cpus); | ||
4320 | |||
4321 | /* | ||
4322 | * When power savings policy is enabled for the parent domain, idle | ||
4323 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4324 | * let the state of idle sibling percolate up as IDLE, instead of | ||
4325 | * portraying it as CPU_NOT_IDLE. | ||
4326 | */ | ||
4327 | if (sd->flags & SD_SHARE_CPUPOWER && | ||
4328 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4329 | sd_idle = 1; | ||
4330 | |||
4331 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | ||
4332 | redo: | ||
4333 | update_shares_locked(this_rq, sd); | ||
4334 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | ||
4335 | &sd_idle, cpus, NULL); | ||
4336 | if (!group) { | ||
4337 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | ||
4338 | goto out_balanced; | ||
4339 | } | ||
4340 | |||
4341 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); | ||
4342 | if (!busiest) { | ||
4343 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | ||
4344 | goto out_balanced; | ||
4345 | } | ||
4346 | |||
4347 | BUG_ON(busiest == this_rq); | ||
4348 | |||
4349 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | ||
4350 | |||
4351 | ld_moved = 0; | ||
4352 | if (busiest->nr_running > 1) { | ||
4353 | /* Attempt to move tasks */ | ||
4354 | double_lock_balance(this_rq, busiest); | ||
4355 | /* this_rq->clock is already updated */ | ||
4356 | update_rq_clock(busiest); | ||
4357 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4358 | imbalance, sd, CPU_NEWLY_IDLE, | ||
4359 | &all_pinned); | ||
4360 | double_unlock_balance(this_rq, busiest); | ||
4361 | |||
4362 | if (unlikely(all_pinned)) { | ||
4363 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
4364 | if (!cpumask_empty(cpus)) | ||
4365 | goto redo; | ||
4366 | } | ||
4367 | } | ||
4368 | |||
4369 | if (!ld_moved) { | ||
4370 | int active_balance = 0; | ||
4371 | |||
4372 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | ||
4373 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4374 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4375 | return -1; | ||
4376 | |||
4377 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4378 | return -1; | ||
4379 | |||
4380 | if (sd->nr_balance_failed++ < 2) | ||
4381 | return -1; | ||
4382 | |||
4383 | /* | ||
4384 | * The only task running in a non-idle cpu can be moved to this | ||
4385 | * cpu in an attempt to completely free up the other CPU | ||
4386 | * package. The same method used to move tasks in load_balance() | ||
4387 | * has been extended for load_balance_newidle() to speed up | ||
4388 | * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) | ||
4389 | * | ||
4390 | * The package power saving logic comes from | ||
4391 | * find_busiest_group(). If there is no imbalance, then | ||
4392 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4393 | * f_b_g() will select a group from which a running task may be | ||
4394 | * pulled to this cpu in order to make the other package idle. | ||
4395 | * If there is no opportunity to make a package idle and if | ||
4396 | * there is no imbalance, then f_b_g() will return NULL and no | ||
4397 | * action will be taken in load_balance_newidle(). | ||
4398 | * | ||
4399 | * Under normal task pull operation due to imbalance, there | ||
4400 | * will be more than one task in the source run queue and | ||
4401 | * move_tasks() will succeed. ld_moved will be true and this | ||
4402 | * active balance code will not be triggered. | ||
4403 | */ | ||
4404 | |||
4405 | /* Lock busiest in correct order while this_rq is held */ | ||
4406 | double_lock_balance(this_rq, busiest); | ||
4407 | |||
4408 | /* | ||
4409 | * don't kick the migration_thread, if the curr | ||
4410 | * task on busiest cpu can't be moved to this_cpu | ||
4411 | */ | ||
4412 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | ||
4413 | double_unlock_balance(this_rq, busiest); | ||
4414 | all_pinned = 1; | ||
4415 | return ld_moved; | ||
4416 | } | ||
4417 | |||
4418 | if (!busiest->active_balance) { | ||
4419 | busiest->active_balance = 1; | ||
4420 | busiest->push_cpu = this_cpu; | ||
4421 | active_balance = 1; | ||
4422 | } | ||
4423 | |||
4424 | double_unlock_balance(this_rq, busiest); | ||
4425 | /* | ||
4426 | * Should not call ttwu while holding a rq->lock | ||
4427 | */ | ||
4428 | spin_unlock(&this_rq->lock); | ||
4429 | if (active_balance) | ||
4430 | wake_up_process(busiest->migration_thread); | ||
4431 | spin_lock(&this_rq->lock); | ||
4432 | |||
4433 | } else | ||
4434 | sd->nr_balance_failed = 0; | ||
4435 | |||
4436 | update_shares_locked(this_rq, sd); | ||
4437 | return ld_moved; | ||
4438 | |||
4439 | out_balanced: | ||
4440 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | ||
4441 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4442 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4443 | return -1; | ||
4444 | sd->nr_balance_failed = 0; | ||
4445 | |||
4446 | return 0; | ||
4447 | } | ||
4448 | |||
4449 | /* | ||
4450 | * idle_balance is called by schedule() if this_cpu is about to become | ||
4451 | * idle. Attempts to pull tasks from other CPUs. | ||
4452 | */ | ||
4453 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
4454 | { | ||
4455 | struct sched_domain *sd; | ||
4456 | int pulled_task = 0; | ||
4457 | unsigned long next_balance = jiffies + HZ; | ||
4458 | |||
4459 | for_each_domain(this_cpu, sd) { | ||
4460 | unsigned long interval; | ||
4461 | |||
4462 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4463 | continue; | ||
4464 | |||
4465 | if (sd->flags & SD_BALANCE_NEWIDLE) | ||
4466 | /* If we've pulled tasks over, stop searching: */ | ||
4467 | pulled_task = load_balance_newidle(this_cpu, this_rq, | ||
4468 | sd); | ||
4469 | |||
4470 | interval = msecs_to_jiffies(sd->balance_interval); | ||
4471 | if (time_after(next_balance, sd->last_balance + interval)) | ||
4472 | next_balance = sd->last_balance + interval; | ||
4473 | if (pulled_task) | ||
4474 | break; | ||
4475 | } | ||
4476 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
4477 | /* | ||
4478 | * We are going idle. next_balance may be set based on | ||
4479 | * a busy processor. So reset next_balance. | ||
4480 | */ | ||
4481 | this_rq->next_balance = next_balance; | ||
4482 | } | ||
4483 | } | ||
4484 | |||
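idle_balance() above keeps the earliest rebalance deadline seen across the cpu's domains, using the wraparound-safe jiffies comparison. The same pattern in a self-contained program (the dom structure and the sample values are made up for illustration):

#include <stdio.h>

/* Wraparound-safe "a is after b" test, in the style of the kernel's time_after(). */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

struct dom { unsigned long last_balance, interval; };

/* Return the earliest rebalance deadline among all domains. */
static unsigned long earliest_next_balance(struct dom *d, int n, unsigned long now)
{
	unsigned long next = now + 100;	/* default: far in the future */
	int i;

	for (i = 0; i < n; i++) {
		unsigned long due = d[i].last_balance + d[i].interval;

		if (time_after(next, due))	/* keep the earliest deadline */
			next = due;
	}
	return next;
}

int main(void)
{
	struct dom doms[] = { { 100, 8 }, { 100, 4 }, { 96, 32 } };

	printf("next balance at %lu\n", earliest_next_balance(doms, 3, 100));
	return 0;
}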
4485 | /* | ||
4486 | * active_load_balance is run by migration threads. It pushes running tasks | ||
4487 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
4488 | * running on each physical CPU where possible, and avoids physical / | ||
4489 | * logical imbalances. | ||
4490 | * | ||
4491 | * Called with busiest_rq locked. | ||
4492 | */ | ||
4493 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
4494 | { | ||
4495 | int target_cpu = busiest_rq->push_cpu; | ||
4496 | struct sched_domain *sd; | ||
4497 | struct rq *target_rq; | ||
4498 | |||
4499 | /* Is there any task to move? */ | ||
4500 | if (busiest_rq->nr_running <= 1) | ||
4501 | return; | ||
4502 | |||
4503 | target_rq = cpu_rq(target_cpu); | ||
4504 | |||
4505 | /* | ||
4506 | * This condition is "impossible"; if it occurs | ||
4507 | * we need to fix it. Originally reported by | ||
4508 | * Bjorn Helgaas on a 128-cpu setup. | ||
4509 | */ | ||
4510 | BUG_ON(busiest_rq == target_rq); | ||
4511 | |||
4512 | /* move a task from busiest_rq to target_rq */ | ||
4513 | double_lock_balance(busiest_rq, target_rq); | ||
4514 | update_rq_clock(busiest_rq); | ||
4515 | update_rq_clock(target_rq); | ||
4516 | |||
4517 | /* Search for an sd spanning us and the target CPU. */ | ||
4518 | for_each_domain(target_cpu, sd) { | ||
4519 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
4520 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
4521 | break; | ||
4522 | } | ||
4523 | |||
4524 | if (likely(sd)) { | ||
4525 | schedstat_inc(sd, alb_count); | ||
4526 | |||
4527 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
4528 | sd, CPU_IDLE)) | ||
4529 | schedstat_inc(sd, alb_pushed); | ||
4530 | else | ||
4531 | schedstat_inc(sd, alb_failed); | ||
4532 | } | ||
4533 | double_unlock_balance(busiest_rq, target_rq); | ||
4534 | } | ||
4535 | |||
4536 | #ifdef CONFIG_NO_HZ | ||
4537 | static struct { | ||
4538 | atomic_t load_balancer; | ||
4539 | cpumask_var_t cpu_mask; | ||
4540 | cpumask_var_t ilb_grp_nohz_mask; | ||
4541 | } nohz ____cacheline_aligned = { | ||
4542 | .load_balancer = ATOMIC_INIT(-1), | ||
4543 | }; | ||
4544 | |||
4545 | int get_nohz_load_balancer(void) | ||
4546 | { | ||
4547 | return atomic_read(&nohz.load_balancer); | ||
4548 | } | ||
4549 | |||
4550 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
4551 | /** | ||
4552 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4553 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4554 | * be returned. | ||
4555 | * @flag: The flag to check for the lowest sched_domain | ||
4556 | * for the given cpu. | ||
4557 | * | ||
4558 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4559 | */ | ||
4560 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4561 | { | ||
4562 | struct sched_domain *sd; | ||
4563 | |||
4564 | for_each_domain(cpu, sd) | ||
4565 | if (sd && (sd->flags & flag)) | ||
4566 | break; | ||
4567 | |||
4568 | return sd; | ||
4569 | } | ||
4570 | |||
4571 | /** | ||
4572 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4573 | * @cpu: The cpu whose domains we're iterating over. | ||
4574 | * @sd: variable holding the value of the power_savings_sd | ||
4575 | * for cpu. | ||
4576 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4577 | * | ||
4578 | * Iterates over all the scheduler domains for a given cpu that have the 'flag' | ||
4579 | * set, starting from the lowest sched_domain to the highest. | ||
4580 | */ | ||
4581 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4582 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4583 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4584 | |||
4585 | /** | ||
4586 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4587 | * @ilb_group: group to be checked for semi-idleness | ||
4588 | * | ||
4589 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4590 | * | ||
4591 | * We define a sched_group to be semi-idle if it has at least one idle CPU | ||
4592 | * and at least one non-idle CPU. This helper function checks whether the given | ||
4593 | * sched_group is semi-idle. | ||
4594 | */ | ||
4595 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4596 | { | ||
4597 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
4598 | sched_group_cpus(ilb_group)); | ||
4599 | |||
4600 | /* | ||
4601 | * A sched_group is semi-idle when it has at least one busy cpu | ||
4602 | * and at least one idle cpu. | ||
4603 | */ | ||
4604 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
4605 | return 0; | ||
4606 | |||
4607 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
4608 | return 0; | ||
4609 | |||
4610 | return 1; | ||
4611 | } | ||
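is_semi_idle_group() boils down to two cpumask tests: the intersection of the nohz mask with the group's mask must be neither empty nor the whole group. With plain bitmask words standing in for cpumask_var_t (an illustrative simplification), the same check is:

#include <stdbool.h>

/*
 * 'idle' is the set of CPUs whose tick is stopped, 'group' is the set of
 * CPUs in the sched_group; each bit represents one CPU.
 */
static bool is_semi_idle(unsigned long idle, unsigned long group)
{
	unsigned long idle_in_group = idle & group;

	if (idle_in_group == 0)		/* no idle CPU in the group */
		return false;
	if (idle_in_group == group)	/* every CPU in the group is idle */
		return false;
	return true;			/* at least one idle and one busy CPU */
}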
4612 | /** | ||
4613 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4614 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4615 | * | ||
4616 | * Returns: The id of the idle load balancer if it exists; | ||
4617 | * else, returns >= nr_cpu_ids. | ||
4618 | * | ||
4619 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4620 | * semi-idle powersavings sched_domain. The idea is to try to avoid | ||
4621 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4622 | * when there are other idle cpus which are better suited for that job. | ||
4623 | */ | ||
4624 | static int find_new_ilb(int cpu) | ||
4625 | { | ||
4626 | struct sched_domain *sd; | ||
4627 | struct sched_group *ilb_group; | ||
4628 | |||
4629 | /* | ||
4630 | * Select the idle load balancer from semi-idle packages only | ||
4631 | * when power-aware load balancing is enabled. | ||
4632 | */ | ||
4633 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4634 | goto out_done; | ||
4635 | |||
4636 | /* | ||
4637 | * Optimize for the case when we have no idle CPUs or only one | ||
4638 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4639 | */ | ||
4640 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
4641 | goto out_done; | ||
4642 | |||
4643 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4644 | ilb_group = sd->groups; | ||
4645 | |||
4646 | do { | ||
4647 | if (is_semi_idle_group(ilb_group)) | ||
4648 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
4649 | |||
4650 | ilb_group = ilb_group->next; | ||
4651 | |||
4652 | } while (ilb_group != sd->groups); | ||
4653 | } | ||
4654 | |||
4655 | out_done: | ||
4656 | return cpumask_first(nohz.cpu_mask); | ||
4657 | } | ||
4658 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4659 | static inline int find_new_ilb(int call_cpu) | ||
4660 | { | ||
4661 | return cpumask_first(nohz.cpu_mask); | ||
4662 | } | ||
4663 | #endif | ||
4664 | |||
4665 | /* | ||
4666 | * This routine tries to nominate the ilb (idle load balancing) | ||
4667 | * owner among the cpus whose ticks are stopped. The ilb owner will do the idle | ||
4668 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
4669 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
4670 | * no need for one) and all the cpus will sleep until the next wakeup event | ||
4671 | * arrives... | ||
4672 | * | ||
4673 | * For the ilb owner, the tick is not stopped, and this tick will be used | ||
4674 | * for idle load balancing. The ilb owner will still be part of | ||
4675 | * nohz.cpu_mask. | ||
4676 | * | ||
4677 | * While stopping the tick, this cpu will become the ilb owner if there | ||
4678 | * is no other owner, and will remain the owner until that cpu becomes busy | ||
4679 | * or until all cpus in the system stop their ticks, at which point | ||
4680 | * there is no need for an ilb owner. | ||
4681 | * | ||
4682 | * When the ilb owner becomes busy, it nominates another owner during the | ||
4683 | * next busy scheduler_tick(). | ||
4684 | */ | ||
4685 | int select_nohz_load_balancer(int stop_tick) | ||
4686 | { | ||
4687 | int cpu = smp_processor_id(); | ||
4688 | |||
4689 | if (stop_tick) { | ||
4690 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
4691 | |||
4692 | if (!cpu_active(cpu)) { | ||
4693 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
4694 | return 0; | ||
4695 | |||
4696 | /* | ||
4697 | * If we are going offline and still the leader, | ||
4698 | * give up! | ||
4699 | */ | ||
4700 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4701 | BUG(); | ||
4702 | |||
4703 | return 0; | ||
4704 | } | ||
4705 | |||
4706 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
4707 | |||
4708 | /* time for the ilb owner to sleep as well */ | ||
4709 | if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
4710 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4711 | atomic_set(&nohz.load_balancer, -1); | ||
4712 | return 0; | ||
4713 | } | ||
4714 | |||
4715 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4716 | /* make me the ilb owner */ | ||
4717 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
4718 | return 1; | ||
4719 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4720 | int new_ilb; | ||
4721 | |||
4722 | if (!(sched_smt_power_savings || | ||
4723 | sched_mc_power_savings)) | ||
4724 | return 1; | ||
4725 | /* | ||
4726 | * Check to see if there is a more power-efficient | ||
4727 | * ilb. | ||
4728 | */ | ||
4729 | new_ilb = find_new_ilb(cpu); | ||
4730 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4731 | atomic_set(&nohz.load_balancer, -1); | ||
4732 | resched_cpu(new_ilb); | ||
4733 | return 0; | ||
4734 | } | ||
4735 | return 1; | ||
4736 | } | ||
4737 | } else { | ||
4738 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4739 | return 0; | ||
4740 | |||
4741 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4742 | |||
4743 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4744 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4745 | BUG(); | ||
4746 | } | ||
4747 | return 0; | ||
4748 | } | ||
4749 | #endif | ||
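select_nohz_load_balancer() elects a single ilb owner with compare-and-swap on nohz.load_balancer: a cpu may claim the slot only while it still holds -1, and only the current owner may clear it. A userspace sketch of that election using C11 atomics (cpu ids are plain ints here, and the helper names are invented for illustration):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Try to become the idle-load-balance owner; return true on success. */
static bool try_become_ilb(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership; only the current owner may clear the slot. */
static bool resign_ilb(int cpu)
{
	int expected = cpu;

	return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}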
4750 | |||
4751 | static DEFINE_SPINLOCK(balancing); | ||
4752 | |||
4753 | /* | ||
4754 | * It checks each scheduling domain to see if it is due to be balanced, | ||
4755 | * and initiates a balancing operation if so. | ||
4756 | * | ||
4757 | * Balancing parameters are set up in arch_init_sched_domains. | ||
4758 | */ | ||
4759 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
4760 | { | ||
4761 | int balance = 1; | ||
4762 | struct rq *rq = cpu_rq(cpu); | ||
4763 | unsigned long interval; | ||
4764 | struct sched_domain *sd; | ||
4765 | /* Earliest time when we have to do rebalance again */ | ||
4766 | unsigned long next_balance = jiffies + 60*HZ; | ||
4767 | int update_next_balance = 0; | ||
4768 | int need_serialize; | ||
4769 | |||
4770 | for_each_domain(cpu, sd) { | ||
4771 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4772 | continue; | ||
4773 | |||
4774 | interval = sd->balance_interval; | ||
4775 | if (idle != CPU_IDLE) | ||
4776 | interval *= sd->busy_factor; | ||
4777 | |||
4778 | /* scale ms to jiffies */ | ||
4779 | interval = msecs_to_jiffies(interval); | ||
4780 | if (unlikely(!interval)) | ||
4781 | interval = 1; | ||
4782 | if (interval > HZ*NR_CPUS/10) | ||
4783 | interval = HZ*NR_CPUS/10; | ||
4784 | |||
4785 | need_serialize = sd->flags & SD_SERIALIZE; | ||
4786 | |||
4787 | if (need_serialize) { | ||
4788 | if (!spin_trylock(&balancing)) | ||
4789 | goto out; | ||
4790 | } | ||
4791 | |||
4792 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
4793 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
4794 | /* | ||
4795 | * We've pulled tasks over so either we're no | ||
4796 | * longer idle, or one of our SMT siblings is | ||
4797 | * not idle. | ||
4798 | */ | ||
4799 | idle = CPU_NOT_IDLE; | ||
4800 | } | ||
4801 | sd->last_balance = jiffies; | ||
4802 | } | ||
4803 | if (need_serialize) | ||
4804 | spin_unlock(&balancing); | ||
4805 | out: | ||
4806 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
4807 | next_balance = sd->last_balance + interval; | ||
4808 | update_next_balance = 1; | ||
4809 | } | ||
4810 | |||
4811 | /* | ||
4812 | * Stop the load balance at this level. There is another | ||
4813 | * CPU in our sched group which is doing load balancing more | ||
4814 | * actively. | ||
4815 | */ | ||
4816 | if (!balance) | ||
4817 | break; | ||
4818 | } | ||
4819 | |||
4820 | /* | ||
4821 | * next_balance will be updated only when there is a need. | ||
4822 | * For example, when the cpu is attached to the null domain, it will not | ||
4823 | * be updated. | ||
4824 | */ | ||
4825 | if (likely(update_next_balance)) | ||
4826 | rq->next_balance = next_balance; | ||
4827 | } | ||
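The interval handling in rebalance_domains() is plain arithmetic: start from the domain's balance_interval in milliseconds, stretch it by busy_factor when the cpu is not idle, convert to jiffies, and clamp to the range [1, HZ*NR_CPUS/10]. A standalone version of that computation (the HZ/NR_CPUS values and the rounding inside msecs_to_jiffies are assumptions for illustration):

#define HZ		250
#define NR_CPUS		64

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;		/* round up; a rough model of the kernel helper */
}

/* Compute the effective rebalance interval, in jiffies, for one domain. */
static unsigned long balance_interval_jiffies(unsigned long interval_ms,
					      unsigned int busy_factor,
					      int cpu_idle)
{
	unsigned long interval = interval_ms;

	if (!cpu_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (interval == 0)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;

	return interval;
}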
4828 | |||
4829 | /* | ||
4830 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
4831 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
4832 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
4833 | */ | ||
4834 | static void run_rebalance_domains(struct softirq_action *h) | ||
4835 | { | ||
4836 | int this_cpu = smp_processor_id(); | ||
4837 | struct rq *this_rq = cpu_rq(this_cpu); | ||
4838 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
4839 | CPU_IDLE : CPU_NOT_IDLE; | ||
4840 | |||
4841 | rebalance_domains(this_cpu, idle); | ||
4842 | |||
4843 | #ifdef CONFIG_NO_HZ | ||
4844 | /* | ||
4845 | * If this cpu is the owner for idle load balancing, then do the | ||
4846 | * balancing on behalf of the other idle cpus whose ticks are | ||
4847 | * stopped. | ||
4848 | */ | ||
4849 | if (this_rq->idle_at_tick && | ||
4850 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
4851 | struct rq *rq; | ||
4852 | int balance_cpu; | ||
4853 | |||
4854 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
4855 | if (balance_cpu == this_cpu) | ||
4856 | continue; | ||
4857 | |||
4858 | /* | ||
4859 | * If this cpu gets work to do, stop the load balancing | ||
4860 | * work being done for other cpus. The next load | ||
4861 | * balancing owner will pick it up. | ||
4862 | */ | ||
4863 | if (need_resched()) | ||
4864 | break; | ||
4865 | |||
4866 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
4867 | |||
4868 | rq = cpu_rq(balance_cpu); | ||
4869 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
4870 | this_rq->next_balance = rq->next_balance; | ||
4871 | } | ||
4872 | } | ||
4873 | #endif | ||
4874 | } | ||
4875 | |||
4876 | static inline int on_null_domain(int cpu) | ||
4877 | { | ||
4878 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
4879 | } | ||
4880 | |||
4881 | /* | ||
4882 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
4883 | * | ||
4884 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
4885 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
4886 | * if the whole system is idle. | ||
4887 | */ | ||
4888 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
4889 | { | ||
4890 | #ifdef CONFIG_NO_HZ | ||
4891 | /* | ||
4892 | * If we were in the nohz mode recently and busy at the current | ||
4893 | * scheduler tick, then check if we need to nominate a new idle | ||
4894 | * load balancer. | ||
4895 | */ | ||
4896 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
4897 | rq->in_nohz_recently = 0; | ||
4898 | |||
4899 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4900 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4901 | atomic_set(&nohz.load_balancer, -1); | ||
4902 | } | ||
4903 | |||
4904 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4905 | int ilb = find_new_ilb(cpu); | ||
4906 | |||
4907 | if (ilb < nr_cpu_ids) | ||
4908 | resched_cpu(ilb); | ||
4909 | } | ||
4910 | } | ||
4911 | |||
4912 | /* | ||
4913 | * If this cpu is idle and doing idle load balancing for all the | ||
4914 | * cpus with ticks stopped, is it time for that to stop? | ||
4915 | */ | ||
4916 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
4917 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
4918 | resched_cpu(cpu); | ||
4919 | return; | ||
4920 | } | ||
4921 | |||
4922 | /* | ||
4923 | * If this cpu is idle and the idle load balancing is done by | ||
4924 | * someone else, then there is no need to raise the SCHED_SOFTIRQ | ||
4925 | */ | ||
4926 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
4927 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4928 | return; | ||
4929 | #endif | ||
4930 | /* Don't need to rebalance while attached to NULL domain */ | ||
4931 | if (time_after_eq(jiffies, rq->next_balance) && | ||
4932 | likely(!on_null_domain(cpu))) | ||
4933 | raise_softirq(SCHED_SOFTIRQ); | ||
4934 | } | ||
4935 | |||
4936 | #else /* CONFIG_SMP */ | ||
4937 | |||
4938 | /* | ||
4939 | * on UP we do not need to balance between CPUs: | ||
4940 | */ | ||
4941 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4942 | { | ||
4943 | } | ||
4944 | |||
4945 | #endif | 3202 | #endif |
4946 | 3203 | ||
4947 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3204 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -5073,8 +3330,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
5073 | p->gtime = cputime_add(p->gtime, cputime); | 3330 | p->gtime = cputime_add(p->gtime, cputime); |
5074 | 3331 | ||
5075 | /* Add guest time to cpustat. */ | 3332 | /* Add guest time to cpustat. */ |
5076 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3333 | if (TASK_NICE(p) > 0) { |
5077 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 3334 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3335 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | ||
3336 | } else { | ||
3337 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3338 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
3339 | } | ||
5078 | } | 3340 | } |
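The hunk above changes guest-time accounting so that time a positively niced task spends in a guest is charged to the nice and guest_nice buckets rather than user and guest. The bookkeeping, reduced to plain 64-bit counters (the field names mirror cpustat; the rest is illustrative):

#include <stdint.h>

struct cpu_stat {
	uint64_t user, nice, guest, guest_nice;
};

/* Charge 'ticks' of guest time, splitting by whether the task is niced. */
static void account_guest(struct cpu_stat *st, int task_nice, uint64_t ticks)
{
	if (task_nice > 0) {
		st->nice       += ticks;
		st->guest_nice += ticks;
	} else {
		st->user  += ticks;
		st->guest += ticks;
	}
}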
5079 | 3341 | ||
5080 | /* | 3342 | /* |
@@ -5189,60 +3451,86 @@ void account_idle_ticks(unsigned long ticks) | |||
5189 | * Use precise platform statistics if available: | 3451 | * Use precise platform statistics if available: |
5190 | */ | 3452 | */ |
5191 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 3453 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
5192 | cputime_t task_utime(struct task_struct *p) | 3454 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
5193 | { | 3455 | { |
5194 | return p->utime; | 3456 | *ut = p->utime; |
3457 | *st = p->stime; | ||
5195 | } | 3458 | } |
5196 | 3459 | ||
5197 | cputime_t task_stime(struct task_struct *p) | 3460 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
5198 | { | 3461 | { |
5199 | return p->stime; | 3462 | struct task_cputime cputime; |
3463 | |||
3464 | thread_group_cputime(p, &cputime); | ||
3465 | |||
3466 | *ut = cputime.utime; | ||
3467 | *st = cputime.stime; | ||
5200 | } | 3468 | } |
5201 | #else | 3469 | #else |
5202 | cputime_t task_utime(struct task_struct *p) | 3470 | |
3471 | #ifndef nsecs_to_cputime | ||
3472 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
3473 | #endif | ||
3474 | |||
3475 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
5203 | { | 3476 | { |
5204 | clock_t utime = cputime_to_clock_t(p->utime), | 3477 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); |
5205 | total = utime + cputime_to_clock_t(p->stime); | ||
5206 | u64 temp; | ||
5207 | 3478 | ||
5208 | /* | 3479 | /* |
5209 | * Use CFS's precise accounting: | 3480 | * Use CFS's precise accounting: |
5210 | */ | 3481 | */ |
5211 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | 3482 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
5212 | 3483 | ||
5213 | if (total) { | 3484 | if (total) { |
5214 | temp *= utime; | 3485 | u64 temp; |
3486 | |||
3487 | temp = (u64)(rtime * utime); | ||
5215 | do_div(temp, total); | 3488 | do_div(temp, total); |
5216 | } | 3489 | utime = (cputime_t)temp; |
5217 | utime = (clock_t)temp; | 3490 | } else |
3491 | utime = rtime; | ||
3492 | |||
3493 | /* | ||
3494 | * Compare with previous values, to keep monotonicity: | ||
3495 | */ | ||
3496 | p->prev_utime = max(p->prev_utime, utime); | ||
3497 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | ||
5218 | 3498 | ||
5219 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | 3499 | *ut = p->prev_utime; |
5220 | return p->prev_utime; | 3500 | *st = p->prev_stime; |
5221 | } | 3501 | } |
5222 | 3502 | ||
5223 | cputime_t task_stime(struct task_struct *p) | 3503 | /* |
3504 | * Must be called with siglock held. | ||
3505 | */ | ||
3506 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
5224 | { | 3507 | { |
5225 | clock_t stime; | 3508 | struct signal_struct *sig = p->signal; |
3509 | struct task_cputime cputime; | ||
3510 | cputime_t rtime, utime, total; | ||
5226 | 3511 | ||
5227 | /* | 3512 | thread_group_cputime(p, &cputime); |
5228 | * Use CFS's precise accounting. (we subtract utime from | ||
5229 | * the total, to make sure the total observed by userspace | ||
5230 | * grows monotonically - apps rely on that): | ||
5231 | */ | ||
5232 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
5233 | cputime_to_clock_t(task_utime(p)); | ||
5234 | 3513 | ||
5235 | if (stime >= 0) | 3514 | total = cputime_add(cputime.utime, cputime.stime); |
5236 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | 3515 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
5237 | 3516 | ||
5238 | return p->prev_stime; | 3517 | if (total) { |
5239 | } | 3518 | u64 temp; |
5240 | #endif | ||
5241 | 3519 | ||
5242 | inline cputime_t task_gtime(struct task_struct *p) | 3520 | temp = (u64)(rtime * cputime.utime); |
5243 | { | 3521 | do_div(temp, total); |
5244 | return p->gtime; | 3522 | utime = (cputime_t)temp; |
3523 | } else | ||
3524 | utime = rtime; | ||
3525 | |||
3526 | sig->prev_utime = max(sig->prev_utime, utime); | ||
3527 | sig->prev_stime = max(sig->prev_stime, | ||
3528 | cputime_sub(rtime, sig->prev_utime)); | ||
3529 | |||
3530 | *ut = sig->prev_utime; | ||
3531 | *st = sig->prev_stime; | ||
5245 | } | 3532 | } |
3533 | #endif | ||
5246 | 3534 | ||
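The new task_times() splits the scheduler's precise runtime (rtime) between user and system time in the ratio of the tick-sampled utime/stime counters, then clamps the reported values so they never go backwards. The arithmetic in isolation, with every quantity kept in one common unit for simplicity (a deliberate simplification of the nsecs_to_cputime conversion above):

#include <stdint.h>

/* Minimal stand-in for the fields task_times() works with. */
struct times {
	uint64_t utime, stime;			/* tick-sampled user/system time */
	uint64_t sum_exec_runtime;		/* precise runtime, same unit here */
	uint64_t prev_utime, prev_stime;	/* last values reported to userspace */
};

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

static void task_times_sketch(struct times *t, uint64_t *ut, uint64_t *st)
{
	uint64_t total = t->utime + t->stime;
	uint64_t rtime = t->sum_exec_runtime;
	uint64_t utime;

	if (total)
		utime = rtime * t->utime / total;	/* split rtime by the sampled ratio */
	else
		utime = rtime;

	/*
	 * Never report values that go backwards; like the kernel, this
	 * assumes rtime is monotonic and stays >= prev_utime.
	 */
	t->prev_utime = max_u64(t->prev_utime, utime);
	t->prev_stime = max_u64(t->prev_stime, rtime - t->prev_utime);

	*ut = t->prev_utime;
	*st = t->prev_stime;
}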
5247 | /* | 3535 | /* |
5248 | * This function gets called by the timer code, with HZ frequency. | 3536 | * This function gets called by the timer code, with HZ frequency. |
@@ -5261,7 +3549,7 @@ void scheduler_tick(void) | |||
5261 | 3549 | ||
5262 | TS_TICK_START(current); | 3550 | TS_TICK_START(current); |
5263 | 3551 | ||
5264 | spin_lock(&rq->lock); | 3552 | raw_spin_lock(&rq->lock); |
5265 | update_rq_clock(rq); | 3553 | update_rq_clock(rq); |
5266 | update_cpu_load(rq); | 3554 | update_cpu_load(rq); |
5267 | curr->sched_class->task_tick(rq, curr, 0); | 3555 | curr->sched_class->task_tick(rq, curr, 0); |
@@ -5269,9 +3557,9 @@ void scheduler_tick(void) | |||
5269 | /* litmus_tick may force current to resched */ | 3557 | /* litmus_tick may force current to resched */ |
5270 | litmus_tick(rq, curr); | 3558 | litmus_tick(rq, curr); |
5271 | 3559 | ||
5272 | spin_unlock(&rq->lock); | 3560 | raw_spin_unlock(&rq->lock); |
5273 | 3561 | ||
5274 | perf_event_task_tick(curr, cpu); | 3562 | perf_event_task_tick(curr); |
5275 | 3563 | ||
5276 | #ifdef CONFIG_SMP | 3564 | #ifdef CONFIG_SMP |
5277 | rq->idle_at_tick = idle_cpu(cpu); | 3565 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -5385,13 +3673,14 @@ static inline void schedule_debug(struct task_struct *prev) | |||
5385 | #endif | 3673 | #endif |
5386 | } | 3674 | } |
5387 | 3675 | ||
5388 | static void put_prev_task(struct rq *rq, struct task_struct *p) | 3676 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
5389 | { | 3677 | { |
5390 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; | 3678 | if (prev->state == TASK_RUNNING) { |
3679 | u64 runtime = prev->se.sum_exec_runtime; | ||
5391 | 3680 | ||
5392 | update_avg(&p->se.avg_running, runtime); | 3681 | runtime -= prev->se.prev_sum_exec_runtime; |
3682 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
5393 | 3683 | ||
5394 | if (p->state == TASK_RUNNING) { | ||
5395 | /* | 3684 | /* |
5396 | * In order to avoid avg_overlap growing stale when we are | 3685 | * In order to avoid avg_overlap growing stale when we are |
5397 | * indeed overlapping and hence not getting put to sleep, grow | 3686 | * indeed overlapping and hence not getting put to sleep, grow |
@@ -5401,12 +3690,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) | |||
5401 | * correlates to the amount of cache footprint a task can | 3690 | * correlates to the amount of cache footprint a task can |
5402 | * build up. | 3691 | * build up. |
5403 | */ | 3692 | */ |
5404 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | 3693 | update_avg(&prev->se.avg_overlap, runtime); |
5405 | update_avg(&p->se.avg_overlap, runtime); | ||
5406 | } else { | ||
5407 | update_avg(&p->se.avg_running, 0); | ||
5408 | } | 3694 | } |
5409 | p->sched_class->put_prev_task(rq, p); | 3695 | prev->sched_class->put_prev_task(rq, prev); |
5410 | } | 3696 | } |
5411 | 3697 | ||
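put_prev_task() clamps the slice the task just ran to twice sysctl_sched_migration_cost and folds it into se.avg_overlap with a running average. Assuming update_avg() is the usual 1/8-weight running average used elsewhere in this file (an assumption; its definition is not shown in this hunk), the update amounts to:

#include <stdint.h>

/* Assumed shape of update_avg(): move 1/8 of the way toward each new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	if (sample >= *avg)
		*avg += (sample - *avg) / 8;
	else
		*avg -= (*avg - sample) / 8;
}

/* Clamp the just-finished runtime the way put_prev_task() does, then fold it in. */
static void fold_overlap(uint64_t *avg_overlap, uint64_t runtime,
			 uint64_t sched_migration_cost)
{
	if (runtime > 2 * sched_migration_cost)
		runtime = 2 * sched_migration_cost;
	update_avg(avg_overlap, runtime);
}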
5412 | /* | 3698 | /* |
@@ -5477,7 +3763,7 @@ need_resched_nonpreemptible: | |||
5477 | if (sched_feat(HRTICK)) | 3763 | if (sched_feat(HRTICK)) |
5478 | hrtick_clear(rq); | 3764 | hrtick_clear(rq); |
5479 | 3765 | ||
5480 | spin_lock_irq(&rq->lock); | 3766 | raw_spin_lock_irq(&rq->lock); |
5481 | update_rq_clock(rq); | 3767 | update_rq_clock(rq); |
5482 | clear_tsk_need_resched(prev); | 3768 | clear_tsk_need_resched(prev); |
5483 | 3769 | ||
@@ -5499,7 +3785,7 @@ need_resched_nonpreemptible: | |||
5499 | 3785 | ||
5500 | if (likely(prev != next)) { | 3786 | if (likely(prev != next)) { |
5501 | sched_info_switch(prev, next); | 3787 | sched_info_switch(prev, next); |
5502 | perf_event_task_sched_out(prev, next, cpu); | 3788 | perf_event_task_sched_out(prev, next); |
5503 | 3789 | ||
5504 | rq->nr_switches++; | 3790 | rq->nr_switches++; |
5505 | rq->curr = next; | 3791 | rq->curr = next; |
@@ -5517,7 +3803,7 @@ need_resched_nonpreemptible: | |||
5517 | rq = cpu_rq(cpu); | 3803 | rq = cpu_rq(cpu); |
5518 | } else { | 3804 | } else { |
5519 | TS_SCHED_END(prev); | 3805 | TS_SCHED_END(prev); |
5520 | spin_unlock_irq(&rq->lock); | 3806 | raw_spin_unlock_irq(&rq->lock); |
5521 | } | 3807 | } |
5522 | 3808 | ||
5523 | sched_trace_task_switch_to(current); | 3809 | sched_trace_task_switch_to(current); |
@@ -5525,11 +3811,12 @@ need_resched_nonpreemptible: | |||
5525 | post_schedule(rq); | 3811 | post_schedule(rq); |
5526 | 3812 | ||
5527 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3813 | if (unlikely(reacquire_kernel_lock(current) < 0)) { |
3814 | prev = rq->curr; | ||
3815 | switch_count = &prev->nivcsw; | ||
5528 | goto need_resched_nonpreemptible; | 3816 | goto need_resched_nonpreemptible; |
5529 | } | 3817 | } |
5530 | 3818 | ||
5531 | preempt_enable_no_resched(); | 3819 | preempt_enable_no_resched(); |
5532 | |||
5533 | if (need_resched()) | 3820 | if (need_resched()) |
5534 | goto need_resched; | 3821 | goto need_resched; |
5535 | 3822 | ||
@@ -5538,7 +3825,7 @@ need_resched_nonpreemptible: | |||
5538 | } | 3825 | } |
5539 | EXPORT_SYMBOL(schedule); | 3826 | EXPORT_SYMBOL(schedule); |
5540 | 3827 | ||
5541 | #ifdef CONFIG_SMP | 3828 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
5542 | /* | 3829 | /* |
5543 | * Look out! "owner" is an entirely speculative pointer | 3830 | * Look out! "owner" is an entirely speculative pointer |
5544 | * access and not reliable. | 3831 | * access and not reliable. |
@@ -5558,7 +3845,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5558 | * the mutex owner just released it and exited. | 3845 | * the mutex owner just released it and exited. |
5559 | */ | 3846 | */ |
5560 | if (probe_kernel_address(&owner->cpu, cpu)) | 3847 | if (probe_kernel_address(&owner->cpu, cpu)) |
5561 | goto out; | 3848 | return 0; |
5562 | #else | 3849 | #else |
5563 | cpu = owner->cpu; | 3850 | cpu = owner->cpu; |
5564 | #endif | 3851 | #endif |
@@ -5568,14 +3855,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5568 | * the cpu field may no longer be valid. | 3855 | * the cpu field may no longer be valid. |
5569 | */ | 3856 | */ |
5570 | if (cpu >= nr_cpumask_bits) | 3857 | if (cpu >= nr_cpumask_bits) |
5571 | goto out; | 3858 | return 0; |
5572 | 3859 | ||
5573 | /* | 3860 | /* |
5574 | * We need to validate that we can do a | 3861 | * We need to validate that we can do a |
5575 | * get_cpu() and that we have the percpu area. | 3862 | * get_cpu() and that we have the percpu area. |
5576 | */ | 3863 | */ |
5577 | if (!cpu_online(cpu)) | 3864 | if (!cpu_online(cpu)) |
5578 | goto out; | 3865 | return 0; |
5579 | 3866 | ||
5580 | rq = cpu_rq(cpu); | 3867 | rq = cpu_rq(cpu); |
5581 | 3868 | ||
@@ -5594,7 +3881,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5594 | 3881 | ||
5595 | cpu_relax(); | 3882 | cpu_relax(); |
5596 | } | 3883 | } |
5597 | out: | 3884 | |
5598 | return 1; | 3885 | return 1; |
5599 | } | 3886 | } |
5600 | #endif | 3887 | #endif |
@@ -5953,14 +4240,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
5953 | */ | 4240 | */ |
5954 | bool try_wait_for_completion(struct completion *x) | 4241 | bool try_wait_for_completion(struct completion *x) |
5955 | { | 4242 | { |
4243 | unsigned long flags; | ||
5956 | int ret = 1; | 4244 | int ret = 1; |
5957 | 4245 | ||
5958 | spin_lock_irq(&x->wait.lock); | 4246 | spin_lock_irqsave(&x->wait.lock, flags); |
5959 | if (!x->done) | 4247 | if (!x->done) |
5960 | ret = 0; | 4248 | ret = 0; |
5961 | else | 4249 | else |
5962 | x->done--; | 4250 | x->done--; |
5963 | spin_unlock_irq(&x->wait.lock); | 4251 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5964 | return ret; | 4252 | return ret; |
5965 | } | 4253 | } |
5966 | EXPORT_SYMBOL(try_wait_for_completion); | 4254 | EXPORT_SYMBOL(try_wait_for_completion); |
@@ -5975,12 +4263,13 @@ EXPORT_SYMBOL(try_wait_for_completion); | |||
5975 | */ | 4263 | */ |
5976 | bool completion_done(struct completion *x) | 4264 | bool completion_done(struct completion *x) |
5977 | { | 4265 | { |
4266 | unsigned long flags; | ||
5978 | int ret = 1; | 4267 | int ret = 1; |
5979 | 4268 | ||
5980 | spin_lock_irq(&x->wait.lock); | 4269 | spin_lock_irqsave(&x->wait.lock, flags); |
5981 | if (!x->done) | 4270 | if (!x->done) |
5982 | ret = 0; | 4271 | ret = 0; |
5983 | spin_unlock_irq(&x->wait.lock); | 4272 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5984 | return ret; | 4273 | return ret; |
5985 | } | 4274 | } |
5986 | EXPORT_SYMBOL(completion_done); | 4275 | EXPORT_SYMBOL(completion_done); |
@@ -6048,7 +4337,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6048 | unsigned long flags; | 4337 | unsigned long flags; |
6049 | int oldprio, on_rq, running; | 4338 | int oldprio, on_rq, running; |
6050 | struct rq *rq; | 4339 | struct rq *rq; |
6051 | const struct sched_class *prev_class = p->sched_class; | 4340 | const struct sched_class *prev_class; |
6052 | 4341 | ||
6053 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4342 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
6054 | 4343 | ||
@@ -6056,6 +4345,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6056 | update_rq_clock(rq); | 4345 | update_rq_clock(rq); |
6057 | 4346 | ||
6058 | oldprio = p->prio; | 4347 | oldprio = p->prio; |
4348 | prev_class = p->sched_class; | ||
6059 | on_rq = p->se.on_rq; | 4349 | on_rq = p->se.on_rq; |
6060 | running = task_current(rq, p); | 4350 | running = task_current(rq, p); |
6061 | if (on_rq) | 4351 | if (on_rq) |
@@ -6073,7 +4363,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6073 | if (running) | 4363 | if (running) |
6074 | p->sched_class->set_curr_task(rq); | 4364 | p->sched_class->set_curr_task(rq); |
6075 | if (on_rq) { | 4365 | if (on_rq) { |
6076 | enqueue_task(rq, p, 0); | 4366 | enqueue_task(rq, p, 0, oldprio < prio); |
6077 | 4367 | ||
6078 | check_class_changed(rq, p, prev_class, oldprio, running); | 4368 | check_class_changed(rq, p, prev_class, oldprio, running); |
6079 | } | 4369 | } |
@@ -6117,7 +4407,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
6117 | delta = p->prio - old_prio; | 4407 | delta = p->prio - old_prio; |
6118 | 4408 | ||
6119 | if (on_rq) { | 4409 | if (on_rq) { |
6120 | enqueue_task(rq, p, 0); | 4410 | enqueue_task(rq, p, 0, false); |
6121 | /* | 4411 | /* |
6122 | * If the task increased its priority or is running and | 4412 | * If the task increased its priority or is running and |
6123 | * lowered its priority, then reschedule its CPU: | 4413 | * lowered its priority, then reschedule its CPU: |
@@ -6140,7 +4430,7 @@ int can_nice(const struct task_struct *p, const int nice) | |||
6140 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4430 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
6141 | int nice_rlim = 20 - nice; | 4431 | int nice_rlim = 20 - nice; |
6142 | 4432 | ||
6143 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4433 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
6144 | capable(CAP_SYS_NICE)); | 4434 | capable(CAP_SYS_NICE)); |
6145 | } | 4435 | } |
6146 | 4436 | ||
@@ -6243,25 +4533,16 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
6243 | BUG_ON(p->se.on_rq); | 4533 | BUG_ON(p->se.on_rq); |
6244 | 4534 | ||
6245 | p->policy = policy; | 4535 | p->policy = policy; |
6246 | switch (p->policy) { | ||
6247 | case SCHED_NORMAL: | ||
6248 | case SCHED_BATCH: | ||
6249 | case SCHED_IDLE: | ||
6250 | p->sched_class = &fair_sched_class; | ||
6251 | break; | ||
6252 | case SCHED_FIFO: | ||
6253 | case SCHED_RR: | ||
6254 | p->sched_class = &rt_sched_class; | ||
6255 | break; | ||
6256 | case SCHED_LITMUS: | ||
6257 | p->sched_class = &litmus_sched_class; | ||
6258 | break; | ||
6259 | } | ||
6260 | |||
6261 | p->rt_priority = prio; | 4536 | p->rt_priority = prio; |
6262 | p->normal_prio = normal_prio(p); | 4537 | p->normal_prio = normal_prio(p); |
6263 | /* we are holding p->pi_lock already */ | 4538 | /* we are holding p->pi_lock already */ |
6264 | p->prio = rt_mutex_getprio(p); | 4539 | p->prio = rt_mutex_getprio(p); |
4540 | if (p->policy == SCHED_LITMUS) | ||
4541 | p->sched_class = &litmus_sched_class; | ||
4542 | else if (rt_prio(p->prio)) | ||
4543 | p->sched_class = &rt_sched_class; | ||
4544 | else | ||
4545 | p->sched_class = &fair_sched_class; | ||
6265 | set_load_weight(p); | 4546 | set_load_weight(p); |
6266 | } | 4547 | } |
6267 | 4548 | ||
@@ -6286,7 +4567,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6286 | { | 4567 | { |
6287 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4568 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
6288 | unsigned long flags; | 4569 | unsigned long flags; |
6289 | const struct sched_class *prev_class = p->sched_class; | 4570 | const struct sched_class *prev_class; |
6290 | struct rq *rq; | 4571 | struct rq *rq; |
6291 | int reset_on_fork; | 4572 | int reset_on_fork; |
6292 | 4573 | ||
@@ -6330,7 +4611,7 @@ recheck: | |||
6330 | 4611 | ||
6331 | if (!lock_task_sighand(p, &flags)) | 4612 | if (!lock_task_sighand(p, &flags)) |
6332 | return -ESRCH; | 4613 | return -ESRCH; |
6333 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4614 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); |
6334 | unlock_task_sighand(p, &flags); | 4615 | unlock_task_sighand(p, &flags); |
6335 | 4616 | ||
6336 | /* can't set/change the rt policy */ | 4617 | /* can't set/change the rt policy */ |
@@ -6384,7 +4665,7 @@ recheck: | |||
6384 | * make sure no PI-waiters arrive (or leave) while we are | 4665 | * make sure no PI-waiters arrive (or leave) while we are |
6385 | * changing the priority of the task: | 4666 | * changing the priority of the task: |
6386 | */ | 4667 | */ |
6387 | spin_lock_irqsave(&p->pi_lock, flags); | 4668 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
6388 | /* | 4669 | /* |
6389 | * To be able to change p->policy safely, the appropriate | 4670 | * To be able to change p->policy safely, the appropriate
6390 | * runqueue lock must be held. | 4671 | * runqueue lock must be held. |
@@ -6394,7 +4675,7 @@ recheck: | |||
6394 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4675 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
6395 | policy = oldpolicy = -1; | 4676 | policy = oldpolicy = -1; |
6396 | __task_rq_unlock(rq); | 4677 | __task_rq_unlock(rq); |
6397 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4678 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
6398 | goto recheck; | 4679 | goto recheck; |
6399 | } | 4680 | } |
6400 | update_rq_clock(rq); | 4681 | update_rq_clock(rq); |
@@ -6411,6 +4692,7 @@ recheck: | |||
6411 | litmus_exit_task(p); | 4692 | litmus_exit_task(p); |
6412 | 4693 | ||
6413 | oldprio = p->prio; | 4694 | oldprio = p->prio; |
4695 | prev_class = p->sched_class; | ||
6414 | __setscheduler(rq, p, policy, param->sched_priority); | 4696 | __setscheduler(rq, p, policy, param->sched_priority); |
6415 | 4697 | ||
6416 | if (policy == SCHED_LITMUS) { | 4698 | if (policy == SCHED_LITMUS) { |
@@ -6427,7 +4709,7 @@ recheck: | |||
6427 | check_class_changed(rq, p, prev_class, oldprio, running); | 4709 | check_class_changed(rq, p, prev_class, oldprio, running); |
6428 | } | 4710 | } |
6429 | __task_rq_unlock(rq); | 4711 | __task_rq_unlock(rq); |
6430 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4712 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
6431 | 4713 | ||
6432 | rt_mutex_adjust_pi(p); | 4714 | rt_mutex_adjust_pi(p); |
6433 | 4715 | ||
@@ -6527,7 +4809,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6527 | return -EINVAL; | 4809 | return -EINVAL; |
6528 | 4810 | ||
6529 | retval = -ESRCH; | 4811 | retval = -ESRCH; |
6530 | read_lock(&tasklist_lock); | 4812 | rcu_read_lock(); |
6531 | p = find_process_by_pid(pid); | 4813 | p = find_process_by_pid(pid); |
6532 | if (p) { | 4814 | if (p) { |
6533 | retval = security_task_getscheduler(p); | 4815 | retval = security_task_getscheduler(p); |
@@ -6535,7 +4817,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6535 | retval = p->policy | 4817 | retval = p->policy |
6536 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | 4818 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); |
6537 | } | 4819 | } |
6538 | read_unlock(&tasklist_lock); | 4820 | rcu_read_unlock(); |
6539 | return retval; | 4821 | return retval; |
6540 | } | 4822 | } |
6541 | 4823 | ||
@@ -6553,7 +4835,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
6553 | if (!param || pid < 0) | 4835 | if (!param || pid < 0) |
6554 | return -EINVAL; | 4836 | return -EINVAL; |
6555 | 4837 | ||
6556 | read_lock(&tasklist_lock); | 4838 | rcu_read_lock(); |
6557 | p = find_process_by_pid(pid); | 4839 | p = find_process_by_pid(pid); |
6558 | retval = -ESRCH; | 4840 | retval = -ESRCH; |
6559 | if (!p) | 4841 | if (!p) |
@@ -6564,7 +4846,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
6564 | goto out_unlock; | 4846 | goto out_unlock; |
6565 | 4847 | ||
6566 | lp.sched_priority = p->rt_priority; | 4848 | lp.sched_priority = p->rt_priority; |
6567 | read_unlock(&tasklist_lock); | 4849 | rcu_read_unlock(); |
6568 | 4850 | ||
6569 | /* | 4851 | /* |
6570 | * This one might sleep, we cannot do it with a spinlock held ... | 4852 | * This one might sleep, we cannot do it with a spinlock held ... |
@@ -6574,7 +4856,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
6574 | return retval; | 4856 | return retval; |
6575 | 4857 | ||
6576 | out_unlock: | 4858 | out_unlock: |
6577 | read_unlock(&tasklist_lock); | 4859 | rcu_read_unlock(); |
6578 | return retval; | 4860 | return retval; |
6579 | } | 4861 | } |
6580 | 4862 | ||
@@ -6585,23 +4867,19 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
6585 | int retval; | 4867 | int retval; |
6586 | 4868 | ||
6587 | get_online_cpus(); | 4869 | get_online_cpus(); |
6588 | read_lock(&tasklist_lock); | 4870 | rcu_read_lock(); |
6589 | 4871 | ||
6590 | p = find_process_by_pid(pid); | 4872 | p = find_process_by_pid(pid); |
6591 | /* Don't set affinity if task not found and for LITMUS tasks */ | 4873 | /* Don't set affinity if task not found and for LITMUS tasks */ |
6592 | if (!p || is_realtime(p)) { | 4874 | if (!p || is_realtime(p)) { |
6593 | read_unlock(&tasklist_lock); | 4875 | rcu_read_unlock(); |
6594 | put_online_cpus(); | 4876 | put_online_cpus(); |
6595 | return p ? -EPERM : -ESRCH; | 4877 | return p ? -EPERM : -ESRCH; |
6596 | } | 4878 | } |
6597 | 4879 | ||
6598 | /* | 4880 | /* Prevent p going away */ |
6599 | * It is not safe to call set_cpus_allowed with the | ||
6600 | * tasklist_lock held. We will bump the task_struct's | ||
6601 | * usage count and then drop tasklist_lock. | ||
6602 | */ | ||
6603 | get_task_struct(p); | 4881 | get_task_struct(p); |
6604 | read_unlock(&tasklist_lock); | 4882 | rcu_read_unlock(); |
6605 | 4883 | ||
6606 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | 4884 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
6607 | retval = -ENOMEM; | 4885 | retval = -ENOMEM; |
@@ -6682,10 +4960,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | |||
6682 | long sched_getaffinity(pid_t pid, struct cpumask *mask) | 4960 | long sched_getaffinity(pid_t pid, struct cpumask *mask) |
6683 | { | 4961 | { |
6684 | struct task_struct *p; | 4962 | struct task_struct *p; |
4963 | unsigned long flags; | ||
4964 | struct rq *rq; | ||
6685 | int retval; | 4965 | int retval; |
6686 | 4966 | ||
6687 | get_online_cpus(); | 4967 | get_online_cpus(); |
6688 | read_lock(&tasklist_lock); | 4968 | rcu_read_lock(); |
6689 | 4969 | ||
6690 | retval = -ESRCH; | 4970 | retval = -ESRCH; |
6691 | p = find_process_by_pid(pid); | 4971 | p = find_process_by_pid(pid); |
@@ -6696,10 +4976,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
6696 | if (retval) | 4976 | if (retval) |
6697 | goto out_unlock; | 4977 | goto out_unlock; |
6698 | 4978 | ||
4979 | rq = task_rq_lock(p, &flags); | ||
6699 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 4980 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
4981 | task_rq_unlock(rq, &flags); | ||
6700 | 4982 | ||
6701 | out_unlock: | 4983 | out_unlock: |
6702 | read_unlock(&tasklist_lock); | 4984 | rcu_read_unlock(); |
6703 | put_online_cpus(); | 4985 | put_online_cpus(); |
6704 | 4986 | ||
6705 | return retval; | 4987 | return retval; |
@@ -6717,7 +4999,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
6717 | int ret; | 4999 | int ret; |
6718 | cpumask_var_t mask; | 5000 | cpumask_var_t mask; |
6719 | 5001 | ||
6720 | if (len < cpumask_size()) | 5002 | if ((len * BITS_PER_BYTE) < nr_cpu_ids) |
5003 | return -EINVAL; | ||
5004 | if (len & (sizeof(unsigned long)-1)) | ||
6721 | return -EINVAL; | 5005 | return -EINVAL; |
6722 | 5006 | ||
6723 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 5007 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
@@ -6725,10 +5009,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
6725 | 5009 | ||
6726 | ret = sched_getaffinity(pid, mask); | 5010 | ret = sched_getaffinity(pid, mask); |
6727 | if (ret == 0) { | 5011 | if (ret == 0) { |
6728 | if (copy_to_user(user_mask_ptr, mask, cpumask_size())) | 5012 | size_t retlen = min_t(size_t, len, cpumask_size()); |
5013 | |||
5014 | if (copy_to_user(user_mask_ptr, mask, retlen)) | ||
6729 | ret = -EFAULT; | 5015 | ret = -EFAULT; |
6730 | else | 5016 | else |
6731 | ret = cpumask_size(); | 5017 | ret = retlen; |
6732 | } | 5018 | } |
6733 | free_cpumask_var(mask); | 5019 | free_cpumask_var(mask); |
6734 | 5020 | ||
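The new checks in sys_sched_getaffinity() accept any user buffer that is a whole number of unsigned longs and large enough to hold nr_cpu_ids bits, and copy back at most cpumask_size() bytes. The validation logic in isolation (the CPU count is an illustrative constant):

#include <stddef.h>
#include <errno.h>

#define BITS_PER_BYTE	8
#define NR_CPU_IDS	64				/* illustrative */
#define CPUMASK_BYTES	(NR_CPU_IDS / BITS_PER_BYTE)

/* Return 0 if 'len' is an acceptable user-buffer size, -EINVAL otherwise. */
static int check_affinity_len(size_t len)
{
	if (len * BITS_PER_BYTE < NR_CPU_IDS)
		return -EINVAL;		/* too small to hold every possible cpu */
	if (len & (sizeof(unsigned long) - 1))
		return -EINVAL;		/* must be a multiple of one word */
	return 0;
}

/* Number of bytes actually copied back to userspace on success. */
static size_t affinity_copy_len(size_t len)
{
	return len < CPUMASK_BYTES ? len : CPUMASK_BYTES;
}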
@@ -6754,7 +5040,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
6754 | */ | 5040 | */ |
6755 | __release(rq->lock); | 5041 | __release(rq->lock); |
6756 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 5042 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
6757 | _raw_spin_unlock(&rq->lock); | 5043 | do_raw_spin_unlock(&rq->lock); |
6758 | preempt_enable_no_resched(); | 5044 | preempt_enable_no_resched(); |
6759 | 5045 | ||
6760 | schedule(); | 5046 | schedule(); |
@@ -6934,6 +5220,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
6934 | { | 5220 | { |
6935 | struct task_struct *p; | 5221 | struct task_struct *p; |
6936 | unsigned int time_slice; | 5222 | unsigned int time_slice; |
5223 | unsigned long flags; | ||
5224 | struct rq *rq; | ||
6937 | int retval; | 5225 | int retval; |
6938 | struct timespec t; | 5226 | struct timespec t; |
6939 | 5227 | ||
@@ -6941,7 +5229,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
6941 | return -EINVAL; | 5229 | return -EINVAL; |
6942 | 5230 | ||
6943 | retval = -ESRCH; | 5231 | retval = -ESRCH; |
6944 | read_lock(&tasklist_lock); | 5232 | rcu_read_lock(); |
6945 | p = find_process_by_pid(pid); | 5233 | p = find_process_by_pid(pid); |
6946 | if (!p) | 5234 | if (!p) |
6947 | goto out_unlock; | 5235 | goto out_unlock; |
@@ -6950,15 +5238,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
6950 | if (retval) | 5238 | if (retval) |
6951 | goto out_unlock; | 5239 | goto out_unlock; |
6952 | 5240 | ||
6953 | time_slice = p->sched_class->get_rr_interval(p); | 5241 | rq = task_rq_lock(p, &flags); |
5242 | time_slice = p->sched_class->get_rr_interval(rq, p); | ||
5243 | task_rq_unlock(rq, &flags); | ||
6954 | 5244 | ||
6955 | read_unlock(&tasklist_lock); | 5245 | rcu_read_unlock(); |
6956 | jiffies_to_timespec(time_slice, &t); | 5246 | jiffies_to_timespec(time_slice, &t); |
6957 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 5247 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
6958 | return retval; | 5248 | return retval; |
6959 | 5249 | ||
6960 | out_unlock: | 5250 | out_unlock: |
6961 | read_unlock(&tasklist_lock); | 5251 | rcu_read_unlock(); |
6962 | return retval; | 5252 | return retval; |
6963 | } | 5253 | } |
6964 | 5254 | ||
@@ -7024,7 +5314,7 @@ void show_state_filter(unsigned long state_filter) | |||
7024 | /* | 5314 | /* |
7025 | * Only show locks if all tasks are dumped: | 5315 | * Only show locks if all tasks are dumped: |
7026 | */ | 5316 | */ |
7027 | if (state_filter == -1) | 5317 | if (!state_filter) |
7028 | debug_show_all_locks(); | 5318 | debug_show_all_locks(); |
7029 | } | 5319 | } |
7030 | 5320 | ||
@@ -7046,12 +5336,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
7046 | struct rq *rq = cpu_rq(cpu); | 5336 | struct rq *rq = cpu_rq(cpu); |
7047 | unsigned long flags; | 5337 | unsigned long flags; |
7048 | 5338 | ||
7049 | spin_lock_irqsave(&rq->lock, flags); | 5339 | raw_spin_lock_irqsave(&rq->lock, flags); |
7050 | 5340 | ||
7051 | __sched_fork(idle); | 5341 | __sched_fork(idle); |
5342 | idle->state = TASK_RUNNING; | ||
7052 | idle->se.exec_start = sched_clock(); | 5343 | idle->se.exec_start = sched_clock(); |
7053 | 5344 | ||
7054 | idle->prio = idle->normal_prio = MAX_PRIO; | ||
7055 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5345 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
7056 | __set_task_cpu(idle, cpu); | 5346 | __set_task_cpu(idle, cpu); |
7057 | 5347 | ||
@@ -7059,7 +5349,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
7059 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5349 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
7060 | idle->oncpu = 1; | 5350 | idle->oncpu = 1; |
7061 | #endif | 5351 | #endif |
7062 | spin_unlock_irqrestore(&rq->lock, flags); | 5352 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7063 | 5353 | ||
7064 | /* Set the preempt count _outside_ the spinlocks! */ | 5354 | /* Set the preempt count _outside_ the spinlocks! */ |
7065 | #if defined(CONFIG_PREEMPT) | 5355 | #if defined(CONFIG_PREEMPT) |
@@ -7092,22 +5382,43 @@ cpumask_var_t nohz_cpu_mask; | |||
7092 | * | 5382 | * |
7093 | * This idea comes from the SD scheduler of Con Kolivas: | 5383 | * This idea comes from the SD scheduler of Con Kolivas: |
7094 | */ | 5384 | */ |
7095 | static inline void sched_init_granularity(void) | 5385 | static int get_update_sysctl_factor(void) |
7096 | { | 5386 | { |
7097 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 5387 | unsigned int cpus = min_t(int, num_online_cpus(), 8); |
7098 | const unsigned long limit = 200000000; | 5388 | unsigned int factor; |
5389 | |||
5390 | switch (sysctl_sched_tunable_scaling) { | ||
5391 | case SCHED_TUNABLESCALING_NONE: | ||
5392 | factor = 1; | ||
5393 | break; | ||
5394 | case SCHED_TUNABLESCALING_LINEAR: | ||
5395 | factor = cpus; | ||
5396 | break; | ||
5397 | case SCHED_TUNABLESCALING_LOG: | ||
5398 | default: | ||
5399 | factor = 1 + ilog2(cpus); | ||
5400 | break; | ||
5401 | } | ||
7099 | 5402 | ||
7100 | sysctl_sched_min_granularity *= factor; | 5403 | return factor; |
7101 | if (sysctl_sched_min_granularity > limit) | 5404 | } |
7102 | sysctl_sched_min_granularity = limit; | ||
7103 | 5405 | ||
7104 | sysctl_sched_latency *= factor; | 5406 | static void update_sysctl(void) |
7105 | if (sysctl_sched_latency > limit) | 5407 | { |
7106 | sysctl_sched_latency = limit; | 5408 | unsigned int factor = get_update_sysctl_factor(); |
7107 | 5409 | ||
7108 | sysctl_sched_wakeup_granularity *= factor; | 5410 | #define SET_SYSCTL(name) \ |
5411 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
5412 | SET_SYSCTL(sched_min_granularity); | ||
5413 | SET_SYSCTL(sched_latency); | ||
5414 | SET_SYSCTL(sched_wakeup_granularity); | ||
5415 | SET_SYSCTL(sched_shares_ratelimit); | ||
5416 | #undef SET_SYSCTL | ||
5417 | } | ||
7109 | 5418 | ||
7110 | sysctl_sched_shares_ratelimit *= factor; | 5419 | static inline void sched_init_granularity(void) |
5420 | { | ||
5421 | update_sysctl(); | ||
7111 | } | 5422 | } |
7112 | 5423 | ||
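get_update_sysctl_factor() maps sysctl_sched_tunable_scaling to a multiplier: 1 for none, the online-cpu count capped at 8 for linear, and 1 + ilog2 of that count for the default logarithmic mode; update_sysctl() then multiplies each normalized tunable by that factor. A standalone version of the factor computation (the enum names mirror the ones above; ilog2 is open-coded here):

enum tunable_scaling {
	SCALING_NONE,
	SCALING_LOG,
	SCALING_LINEAR,
};

static unsigned int ilog2_u32(unsigned int x)
{
	unsigned int log = 0;

	while (x >>= 1)
		log++;
	return log;
}

static unsigned int sysctl_factor(enum tunable_scaling mode, unsigned int online_cpus)
{
	unsigned int cpus = online_cpus < 8 ? online_cpus : 8;

	switch (mode) {
	case SCALING_NONE:
		return 1;
	case SCALING_LINEAR:
		return cpus;
	case SCALING_LOG:
	default:
		return 1 + ilog2_u32(cpus);	/* e.g. 8 online CPUs -> factor 4 */
	}
}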
7113 | #ifdef CONFIG_SMP | 5424 | #ifdef CONFIG_SMP |
@@ -7144,7 +5455,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7144 | int ret = 0; | 5455 | int ret = 0; |
7145 | 5456 | ||
7146 | rq = task_rq_lock(p, &flags); | 5457 | rq = task_rq_lock(p, &flags); |
7147 | if (!cpumask_intersects(new_mask, cpu_online_mask)) { | 5458 | |
5459 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | ||
7148 | ret = -EINVAL; | 5460 | ret = -EINVAL; |
7149 | goto out; | 5461 | goto out; |
7150 | } | 5462 | } |
@@ -7166,13 +5478,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7166 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 5478 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
7167 | goto out; | 5479 | goto out; |
7168 | 5480 | ||
7169 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 5481 | if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { |
7170 | /* Need help from migration thread: drop lock and wait. */ | 5482 | /* Need help from migration thread: drop lock and wait. */ |
7171 | struct task_struct *mt = rq->migration_thread; | 5483 | struct task_struct *mt = rq->migration_thread; |
7172 | 5484 | ||
7173 | get_task_struct(mt); | 5485 | get_task_struct(mt); |
7174 | task_rq_unlock(rq, &flags); | 5486 | task_rq_unlock(rq, &flags); |
7175 | wake_up_process(rq->migration_thread); | 5487 | wake_up_process(mt); |
7176 | put_task_struct(mt); | 5488 | put_task_struct(mt); |
7177 | wait_for_completion(&req.done); | 5489 | wait_for_completion(&req.done); |
7178 | tlb_migrate_finish(p->mm); | 5490 | tlb_migrate_finish(p->mm); |
@@ -7199,7 +5511,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
7199 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 5511 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
7200 | { | 5512 | { |
7201 | struct rq *rq_dest, *rq_src; | 5513 | struct rq *rq_dest, *rq_src; |
7202 | int ret = 0, on_rq; | 5514 | int ret = 0; |
7203 | 5515 | ||
7204 | if (unlikely(!cpu_active(dest_cpu))) | 5516 | if (unlikely(!cpu_active(dest_cpu))) |
7205 | return ret; | 5517 | return ret; |
@@ -7215,12 +5527,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
7215 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 5527 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
7216 | goto fail; | 5528 | goto fail; |
7217 | 5529 | ||
7218 | on_rq = p->se.on_rq; | 5530 | /* |
7219 | if (on_rq) | 5531 | * If we're not on a rq, the next wake-up will ensure we're |
5532 | * placed properly. | ||
5533 | */ | ||
5534 | if (p->se.on_rq) { | ||
7220 | deactivate_task(rq_src, p, 0); | 5535 | deactivate_task(rq_src, p, 0); |
7221 | 5536 | set_task_cpu(p, dest_cpu); | |
7222 | set_task_cpu(p, dest_cpu); | ||
7223 | if (on_rq) { | ||
7224 | activate_task(rq_dest, p, 0); | 5537 | activate_task(rq_dest, p, 0); |
7225 | check_preempt_curr(rq_dest, p, 0); | 5538 | check_preempt_curr(rq_dest, p, 0); |
7226 | } | 5539 | } |
@@ -7255,10 +5568,10 @@ static int migration_thread(void *data) | |||
7255 | struct migration_req *req; | 5568 | struct migration_req *req; |
7256 | struct list_head *head; | 5569 | struct list_head *head; |
7257 | 5570 | ||
7258 | spin_lock_irq(&rq->lock); | 5571 | raw_spin_lock_irq(&rq->lock); |
7259 | 5572 | ||
7260 | if (cpu_is_offline(cpu)) { | 5573 | if (cpu_is_offline(cpu)) { |
7261 | spin_unlock_irq(&rq->lock); | 5574 | raw_spin_unlock_irq(&rq->lock); |
7262 | break; | 5575 | break; |
7263 | } | 5576 | } |
7264 | 5577 | ||
@@ -7270,7 +5583,7 @@ static int migration_thread(void *data) | |||
7270 | head = &rq->migration_queue; | 5583 | head = &rq->migration_queue; |
7271 | 5584 | ||
7272 | if (list_empty(head)) { | 5585 | if (list_empty(head)) { |
7273 | spin_unlock_irq(&rq->lock); | 5586 | raw_spin_unlock_irq(&rq->lock); |
7274 | schedule(); | 5587 | schedule(); |
7275 | set_current_state(TASK_INTERRUPTIBLE); | 5588 | set_current_state(TASK_INTERRUPTIBLE); |
7276 | continue; | 5589 | continue; |
@@ -7279,14 +5592,14 @@ static int migration_thread(void *data) | |||
7279 | list_del_init(head->next); | 5592 | list_del_init(head->next); |
7280 | 5593 | ||
7281 | if (req->task != NULL) { | 5594 | if (req->task != NULL) { |
7282 | spin_unlock(&rq->lock); | 5595 | raw_spin_unlock(&rq->lock); |
7283 | __migrate_task(req->task, cpu, req->dest_cpu); | 5596 | __migrate_task(req->task, cpu, req->dest_cpu); |
7284 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | 5597 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { |
7285 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | 5598 | req->dest_cpu = RCU_MIGRATION_GOT_QS; |
7286 | spin_unlock(&rq->lock); | 5599 | raw_spin_unlock(&rq->lock); |
7287 | } else { | 5600 | } else { |
7288 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | 5601 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; |
7289 | spin_unlock(&rq->lock); | 5602 | raw_spin_unlock(&rq->lock); |
7290 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | 5603 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); |
7291 | } | 5604 | } |
7292 | local_irq_enable(); | 5605 | local_irq_enable(); |
@@ -7316,37 +5629,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) | |||
7316 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5629 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
7317 | { | 5630 | { |
7318 | int dest_cpu; | 5631 | int dest_cpu; |
7319 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); | ||
7320 | 5632 | ||
7321 | again: | 5633 | again: |
7322 | /* Look for allowed, online CPU in same node. */ | 5634 | dest_cpu = select_fallback_rq(dead_cpu, p); |
7323 | for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) | ||
7324 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | ||
7325 | goto move; | ||
7326 | |||
7327 | /* Any allowed, online CPU? */ | ||
7328 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); | ||
7329 | if (dest_cpu < nr_cpu_ids) | ||
7330 | goto move; | ||
7331 | |||
7332 | /* No more Mr. Nice Guy. */ | ||
7333 | if (dest_cpu >= nr_cpu_ids) { | ||
7334 | cpuset_cpus_allowed_locked(p, &p->cpus_allowed); | ||
7335 | dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); | ||
7336 | |||
7337 | /* | ||
7338 | * Don't tell them about moving exiting tasks or | ||
7339 | * kernel threads (both mm NULL), since they never | ||
7340 | * leave kernel. | ||
7341 | */ | ||
7342 | if (p->mm && printk_ratelimit()) { | ||
7343 | printk(KERN_INFO "process %d (%s) no " | ||
7344 | "longer affine to cpu%d\n", | ||
7345 | task_pid_nr(p), p->comm, dead_cpu); | ||
7346 | } | ||
7347 | } | ||
7348 | 5635 | ||
7349 | move: | ||
7350 | /* It can have affinity changed while we were choosing. */ | 5636 | /* It can have affinity changed while we were choosing. */ |
7351 | if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) | 5637 | if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) |
7352 | goto again; | 5638 | goto again; |
@@ -7361,7 +5647,7 @@ move: | |||
7361 | */ | 5647 | */ |
7362 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5648 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
7363 | { | 5649 | { |
7364 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); | 5650 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
7365 | unsigned long flags; | 5651 | unsigned long flags; |
7366 | 5652 | ||
7367 | local_irq_save(flags); | 5653 | local_irq_save(flags); |
@@ -7409,14 +5695,14 @@ void sched_idle_next(void) | |||
7409 | * Strictly not necessary since rest of the CPUs are stopped by now | 5695 | * Strictly not necessary since rest of the CPUs are stopped by now |
7410 | * and interrupts disabled on the current cpu. | 5696 | * and interrupts disabled on the current cpu. |
7411 | */ | 5697 | */ |
7412 | spin_lock_irqsave(&rq->lock, flags); | 5698 | raw_spin_lock_irqsave(&rq->lock, flags); |
7413 | 5699 | ||
7414 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5700 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
7415 | 5701 | ||
7416 | update_rq_clock(rq); | 5702 | update_rq_clock(rq); |
7417 | activate_task(rq, p, 0); | 5703 | activate_task(rq, p, 0); |
7418 | 5704 | ||
7419 | spin_unlock_irqrestore(&rq->lock, flags); | 5705 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7420 | } | 5706 | } |
7421 | 5707 | ||
7422 | /* | 5708 | /* |
@@ -7452,9 +5738,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
7452 | * that's OK. No task can be added to this CPU, so iteration is | 5738 | * that's OK. No task can be added to this CPU, so iteration is |
7453 | * fine. | 5739 | * fine. |
7454 | */ | 5740 | */ |
7455 | spin_unlock_irq(&rq->lock); | 5741 | raw_spin_unlock_irq(&rq->lock); |
7456 | move_task_off_dead_cpu(dead_cpu, p); | 5742 | move_task_off_dead_cpu(dead_cpu, p); |
7457 | spin_lock_irq(&rq->lock); | 5743 | raw_spin_lock_irq(&rq->lock); |
7458 | 5744 | ||
7459 | put_task_struct(p); | 5745 | put_task_struct(p); |
7460 | } | 5746 | } |
@@ -7495,17 +5781,16 @@ static struct ctl_table sd_ctl_dir[] = { | |||
7495 | .procname = "sched_domain", | 5781 | .procname = "sched_domain", |
7496 | .mode = 0555, | 5782 | .mode = 0555, |
7497 | }, | 5783 | }, |
7498 | {0, }, | 5784 | {} |
7499 | }; | 5785 | }; |
7500 | 5786 | ||
7501 | static struct ctl_table sd_ctl_root[] = { | 5787 | static struct ctl_table sd_ctl_root[] = { |
7502 | { | 5788 | { |
7503 | .ctl_name = CTL_KERN, | ||
7504 | .procname = "kernel", | 5789 | .procname = "kernel", |
7505 | .mode = 0555, | 5790 | .mode = 0555, |
7506 | .child = sd_ctl_dir, | 5791 | .child = sd_ctl_dir, |
7507 | }, | 5792 | }, |
7508 | {0, }, | 5793 | {} |
7509 | }; | 5794 | }; |
7510 | 5795 | ||
7511 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5796 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
@@ -7615,7 +5900,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
7615 | static struct ctl_table_header *sd_sysctl_header; | 5900 | static struct ctl_table_header *sd_sysctl_header; |
7616 | static void register_sched_domain_sysctl(void) | 5901 | static void register_sched_domain_sysctl(void) |
7617 | { | 5902 | { |
7618 | int i, cpu_num = num_online_cpus(); | 5903 | int i, cpu_num = num_possible_cpus(); |
7619 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5904 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
7620 | char buf[32]; | 5905 | char buf[32]; |
7621 | 5906 | ||
@@ -7625,7 +5910,7 @@ static void register_sched_domain_sysctl(void) | |||
7625 | if (entry == NULL) | 5910 | if (entry == NULL) |
7626 | return; | 5911 | return; |
7627 | 5912 | ||
7628 | for_each_online_cpu(i) { | 5913 | for_each_possible_cpu(i) { |
7629 | snprintf(buf, 32, "cpu%d", i); | 5914 | snprintf(buf, 32, "cpu%d", i); |
7630 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5915 | entry->procname = kstrdup(buf, GFP_KERNEL); |
7631 | entry->mode = 0555; | 5916 | entry->mode = 0555; |
@@ -7721,13 +6006,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7721 | 6006 | ||
7722 | /* Update our root-domain */ | 6007 | /* Update our root-domain */ |
7723 | rq = cpu_rq(cpu); | 6008 | rq = cpu_rq(cpu); |
7724 | spin_lock_irqsave(&rq->lock, flags); | 6009 | raw_spin_lock_irqsave(&rq->lock, flags); |
7725 | if (rq->rd) { | 6010 | if (rq->rd) { |
7726 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6011 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
7727 | 6012 | ||
7728 | set_rq_online(rq); | 6013 | set_rq_online(rq); |
7729 | } | 6014 | } |
7730 | spin_unlock_irqrestore(&rq->lock, flags); | 6015 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7731 | break; | 6016 | break; |
7732 | 6017 | ||
7733 | #ifdef CONFIG_HOTPLUG_CPU | 6018 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -7752,14 +6037,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7752 | put_task_struct(rq->migration_thread); | 6037 | put_task_struct(rq->migration_thread); |
7753 | rq->migration_thread = NULL; | 6038 | rq->migration_thread = NULL; |
7754 | /* Idle task back to normal (off runqueue, low prio) */ | 6039 | /* Idle task back to normal (off runqueue, low prio) */ |
7755 | spin_lock_irq(&rq->lock); | 6040 | raw_spin_lock_irq(&rq->lock); |
7756 | update_rq_clock(rq); | 6041 | update_rq_clock(rq); |
7757 | deactivate_task(rq, rq->idle, 0); | 6042 | deactivate_task(rq, rq->idle, 0); |
7758 | rq->idle->static_prio = MAX_PRIO; | ||
7759 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | 6043 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
7760 | rq->idle->sched_class = &idle_sched_class; | 6044 | rq->idle->sched_class = &idle_sched_class; |
7761 | migrate_dead_tasks(cpu); | 6045 | migrate_dead_tasks(cpu); |
7762 | spin_unlock_irq(&rq->lock); | 6046 | raw_spin_unlock_irq(&rq->lock); |
7763 | cpuset_unlock(); | 6047 | cpuset_unlock(); |
7764 | migrate_nr_uninterruptible(rq); | 6048 | migrate_nr_uninterruptible(rq); |
7765 | BUG_ON(rq->nr_running != 0); | 6049 | BUG_ON(rq->nr_running != 0); |
@@ -7769,30 +6053,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7769 | * they didn't take sched_hotcpu_mutex. Just wake up | 6053 | * they didn't take sched_hotcpu_mutex. Just wake up |
7770 | * the requestors. | 6054 | * the requestors. |
7771 | */ | 6055 | */ |
7772 | spin_lock_irq(&rq->lock); | 6056 | raw_spin_lock_irq(&rq->lock); |
7773 | while (!list_empty(&rq->migration_queue)) { | 6057 | while (!list_empty(&rq->migration_queue)) { |
7774 | struct migration_req *req; | 6058 | struct migration_req *req; |
7775 | 6059 | ||
7776 | req = list_entry(rq->migration_queue.next, | 6060 | req = list_entry(rq->migration_queue.next, |
7777 | struct migration_req, list); | 6061 | struct migration_req, list); |
7778 | list_del_init(&req->list); | 6062 | list_del_init(&req->list); |
7779 | spin_unlock_irq(&rq->lock); | 6063 | raw_spin_unlock_irq(&rq->lock); |
7780 | complete(&req->done); | 6064 | complete(&req->done); |
7781 | spin_lock_irq(&rq->lock); | 6065 | raw_spin_lock_irq(&rq->lock); |
7782 | } | 6066 | } |
7783 | spin_unlock_irq(&rq->lock); | 6067 | raw_spin_unlock_irq(&rq->lock); |
7784 | break; | 6068 | break; |
7785 | 6069 | ||
7786 | case CPU_DYING: | 6070 | case CPU_DYING: |
7787 | case CPU_DYING_FROZEN: | 6071 | case CPU_DYING_FROZEN: |
7788 | /* Update our root-domain */ | 6072 | /* Update our root-domain */ |
7789 | rq = cpu_rq(cpu); | 6073 | rq = cpu_rq(cpu); |
7790 | spin_lock_irqsave(&rq->lock, flags); | 6074 | raw_spin_lock_irqsave(&rq->lock, flags); |
7791 | if (rq->rd) { | 6075 | if (rq->rd) { |
7792 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6076 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
7793 | set_rq_offline(rq); | 6077 | set_rq_offline(rq); |
7794 | } | 6078 | } |
7795 | spin_unlock_irqrestore(&rq->lock, flags); | 6079 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7796 | break; | 6080 | break; |
7797 | #endif | 6081 | #endif |
7798 | } | 6082 | } |
@@ -7829,6 +6113,16 @@ early_initcall(migration_init); | |||
7829 | 6113 | ||
7830 | #ifdef CONFIG_SCHED_DEBUG | 6114 | #ifdef CONFIG_SCHED_DEBUG |
7831 | 6115 | ||
6116 | static __read_mostly int sched_domain_debug_enabled; | ||
6117 | |||
6118 | static int __init sched_domain_debug_setup(char *str) | ||
6119 | { | ||
6120 | sched_domain_debug_enabled = 1; | ||
6121 | |||
6122 | return 0; | ||
6123 | } | ||
6124 | early_param("sched_debug", sched_domain_debug_setup); | ||
6125 | |||
7832 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6126 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
7833 | struct cpumask *groupmask) | 6127 | struct cpumask *groupmask) |
7834 | { | 6128 | { |
@@ -7915,6 +6209,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
7915 | cpumask_var_t groupmask; | 6209 | cpumask_var_t groupmask; |
7916 | int level = 0; | 6210 | int level = 0; |
7917 | 6211 | ||
6212 | if (!sched_domain_debug_enabled) | ||
6213 | return; | ||
6214 | |||
7918 | if (!sd) { | 6215 | if (!sd) { |
7919 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 6216 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
7920 | return; | 6217 | return; |
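Taken together, the two additions above make the sched-domain dump opt-in: sched_domain_debug() now returns immediately unless sched_domain_debug_enabled was set by the early parameter. So, assuming CONFIG_SCHED_DEBUG=y, the per-CPU domain hierarchy is only printed when the kernel is booted with something like

        ... sched_debug

appended to the command line.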
@@ -7994,6 +6291,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
7994 | 6291 | ||
7995 | static void free_rootdomain(struct root_domain *rd) | 6292 | static void free_rootdomain(struct root_domain *rd) |
7996 | { | 6293 | { |
6294 | synchronize_sched(); | ||
6295 | |||
7997 | cpupri_cleanup(&rd->cpupri); | 6296 | cpupri_cleanup(&rd->cpupri); |
7998 | 6297 | ||
7999 | free_cpumask_var(rd->rto_mask); | 6298 | free_cpumask_var(rd->rto_mask); |
@@ -8007,7 +6306,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
8007 | struct root_domain *old_rd = NULL; | 6306 | struct root_domain *old_rd = NULL; |
8008 | unsigned long flags; | 6307 | unsigned long flags; |
8009 | 6308 | ||
8010 | spin_lock_irqsave(&rq->lock, flags); | 6309 | raw_spin_lock_irqsave(&rq->lock, flags); |
8011 | 6310 | ||
8012 | if (rq->rd) { | 6311 | if (rq->rd) { |
8013 | old_rd = rq->rd; | 6312 | old_rd = rq->rd; |
@@ -8033,7 +6332,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
8033 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | 6332 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
8034 | set_rq_online(rq); | 6333 | set_rq_online(rq); |
8035 | 6334 | ||
8036 | spin_unlock_irqrestore(&rq->lock, flags); | 6335 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8037 | 6336 | ||
8038 | if (old_rd) | 6337 | if (old_rd) |
8039 | free_rootdomain(old_rd); | 6338 | free_rootdomain(old_rd); |
@@ -8134,6 +6433,7 @@ static cpumask_var_t cpu_isolated_map; | |||
8134 | /* Setup the mask of cpus configured for isolated domains */ | 6433 | /* Setup the mask of cpus configured for isolated domains */ |
8135 | static int __init isolated_cpu_setup(char *str) | 6434 | static int __init isolated_cpu_setup(char *str) |
8136 | { | 6435 | { |
6436 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
8137 | cpulist_parse(str, cpu_isolated_map); | 6437 | cpulist_parse(str, cpu_isolated_map); |
8138 | return 1; | 6438 | return 1; |
8139 | } | 6439 | } |
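isolated_cpu_setup() runs while the boot command line is parsed, before sched_init() has a chance to allocate the mask, so it now allocates cpu_isolated_map from bootmem itself; the guarded zalloc_cpumask_var() near the end of sched_init() later in this patch ("May be allocated at isolcpus cmdline parse time") is the other half of the same change. A boot option such as isolcpus=2-3 (any list accepted by cpulist_parse(); the CPU numbers here are illustrative) is the case that takes this path.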
@@ -8318,14 +6618,14 @@ enum s_alloc { | |||
8318 | */ | 6618 | */ |
8319 | #ifdef CONFIG_SCHED_SMT | 6619 | #ifdef CONFIG_SCHED_SMT |
8320 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | 6620 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); |
8321 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); | 6621 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); |
8322 | 6622 | ||
8323 | static int | 6623 | static int |
8324 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 6624 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, |
8325 | struct sched_group **sg, struct cpumask *unused) | 6625 | struct sched_group **sg, struct cpumask *unused) |
8326 | { | 6626 | { |
8327 | if (sg) | 6627 | if (sg) |
8328 | *sg = &per_cpu(sched_group_cpus, cpu).sg; | 6628 | *sg = &per_cpu(sched_groups, cpu).sg; |
8329 | return cpu; | 6629 | return cpu; |
8330 | } | 6630 | } |
8331 | #endif /* CONFIG_SCHED_SMT */ | 6631 | #endif /* CONFIG_SCHED_SMT */ |
@@ -8970,7 +7270,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) | |||
8970 | return __build_sched_domains(cpu_map, NULL); | 7270 | return __build_sched_domains(cpu_map, NULL); |
8971 | } | 7271 | } |
8972 | 7272 | ||
8973 | static struct cpumask *doms_cur; /* current sched domains */ | 7273 | static cpumask_var_t *doms_cur; /* current sched domains */ |
8974 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7274 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
8975 | static struct sched_domain_attr *dattr_cur; | 7275 | static struct sched_domain_attr *dattr_cur; |
8976 | /* attributes of custom domains in 'doms_cur' */ | 7276 | /* attributes of custom domains in 'doms_cur' */ |
@@ -8992,6 +7292,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) | |||
8992 | return 0; | 7292 | return 0; |
8993 | } | 7293 | } |
8994 | 7294 | ||
7295 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
7296 | { | ||
7297 | int i; | ||
7298 | cpumask_var_t *doms; | ||
7299 | |||
7300 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
7301 | if (!doms) | ||
7302 | return NULL; | ||
7303 | for (i = 0; i < ndoms; i++) { | ||
7304 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
7305 | free_sched_domains(doms, i); | ||
7306 | return NULL; | ||
7307 | } | ||
7308 | } | ||
7309 | return doms; | ||
7310 | } | ||
7311 | |||
7312 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
7313 | { | ||
7314 | unsigned int i; | ||
7315 | for (i = 0; i < ndoms; i++) | ||
7316 | free_cpumask_var(doms[i]); | ||
7317 | kfree(doms); | ||
7318 | } | ||
7319 | |||
8995 | /* | 7320 | /* |
8996 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 7321 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
8997 | * For now this just excludes isolated cpus, but could be used to | 7322 | * For now this just excludes isolated cpus, but could be used to |
@@ -9003,12 +7328,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
9003 | 7328 | ||
9004 | arch_update_cpu_topology(); | 7329 | arch_update_cpu_topology(); |
9005 | ndoms_cur = 1; | 7330 | ndoms_cur = 1; |
9006 | doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); | 7331 | doms_cur = alloc_sched_domains(ndoms_cur); |
9007 | if (!doms_cur) | 7332 | if (!doms_cur) |
9008 | doms_cur = fallback_doms; | 7333 | doms_cur = &fallback_doms; |
9009 | cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); | 7334 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
9010 | dattr_cur = NULL; | 7335 | dattr_cur = NULL; |
9011 | err = build_sched_domains(doms_cur); | 7336 | err = build_sched_domains(doms_cur[0]); |
9012 | register_sched_domain_sysctl(); | 7337 | register_sched_domain_sysctl(); |
9013 | 7338 | ||
9014 | return err; | 7339 | return err; |
@@ -9058,19 +7383,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
9058 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 7383 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
9059 | * It destroys each deleted domain and builds each new domain. | 7384 | * It destroys each deleted domain and builds each new domain. |
9060 | * | 7385 | * |
9061 | * 'doms_new' is an array of cpumask's of length 'ndoms_new'. | 7386 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. |
9062 | * The masks don't intersect (don't overlap). We should set up one | 7387 | * The masks don't intersect (don't overlap). We should set up one |
9063 | * sched domain for each mask. CPUs not in any of the cpumasks will | 7388 | * sched domain for each mask. CPUs not in any of the cpumasks will |
9064 | * not be load balanced. If the same cpumask appears both in the | 7389 | * not be load balanced. If the same cpumask appears both in the |
9065 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 7390 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
9066 | * it as it is. | 7391 | * it as it is. |
9067 | * | 7392 | * |
9068 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | 7393 | * The passed in 'doms_new' should be allocated using |
9069 | * ownership of it and will kfree it when done with it. If the caller | 7394 | * alloc_sched_domains. This routine takes ownership of it and will |
9070 | * failed the kmalloc call, then it can pass in doms_new == NULL && | 7395 | * free_sched_domains it when done with it. If the caller failed the |
9071 | * ndoms_new == 1, and partition_sched_domains() will fallback to | 7396 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, |
9072 | * the single partition 'fallback_doms', it also forces the domains | 7397 | * and partition_sched_domains() will fall back to the single partition |
9073 | * to be rebuilt. | 7398 | * 'fallback_doms'; this also forces the domains to be rebuilt. |
9074 | * | 7399 | * |
9075 | * If doms_new == NULL it will be replaced with cpu_online_mask. | 7400 | * If doms_new == NULL it will be replaced with cpu_online_mask. |
9076 | * ndoms_new == 0 is a special case for destroying existing domains, | 7401 | * ndoms_new == 0 is a special case for destroying existing domains, |
@@ -9078,8 +7403,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
9078 | * | 7403 | * |
9079 | * Call with hotplug lock held | 7404 | * Call with hotplug lock held |
9080 | */ | 7405 | */ |
9081 | /* FIXME: Change to struct cpumask *doms_new[] */ | 7406 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
9082 | void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | ||
9083 | struct sched_domain_attr *dattr_new) | 7407 | struct sched_domain_attr *dattr_new) |
9084 | { | 7408 | { |
9085 | int i, j, n; | 7409 | int i, j, n; |
@@ -9098,40 +7422,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | |||
9098 | /* Destroy deleted domains */ | 7422 | /* Destroy deleted domains */ |
9099 | for (i = 0; i < ndoms_cur; i++) { | 7423 | for (i = 0; i < ndoms_cur; i++) { |
9100 | for (j = 0; j < n && !new_topology; j++) { | 7424 | for (j = 0; j < n && !new_topology; j++) { |
9101 | if (cpumask_equal(&doms_cur[i], &doms_new[j]) | 7425 | if (cpumask_equal(doms_cur[i], doms_new[j]) |
9102 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7426 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
9103 | goto match1; | 7427 | goto match1; |
9104 | } | 7428 | } |
9105 | /* no match - a current sched domain not in new doms_new[] */ | 7429 | /* no match - a current sched domain not in new doms_new[] */ |
9106 | detach_destroy_domains(doms_cur + i); | 7430 | detach_destroy_domains(doms_cur[i]); |
9107 | match1: | 7431 | match1: |
9108 | ; | 7432 | ; |
9109 | } | 7433 | } |
9110 | 7434 | ||
9111 | if (doms_new == NULL) { | 7435 | if (doms_new == NULL) { |
9112 | ndoms_cur = 0; | 7436 | ndoms_cur = 0; |
9113 | doms_new = fallback_doms; | 7437 | doms_new = &fallback_doms; |
9114 | cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); | 7438 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
9115 | WARN_ON_ONCE(dattr_new); | 7439 | WARN_ON_ONCE(dattr_new); |
9116 | } | 7440 | } |
9117 | 7441 | ||
9118 | /* Build new domains */ | 7442 | /* Build new domains */ |
9119 | for (i = 0; i < ndoms_new; i++) { | 7443 | for (i = 0; i < ndoms_new; i++) { |
9120 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 7444 | for (j = 0; j < ndoms_cur && !new_topology; j++) { |
9121 | if (cpumask_equal(&doms_new[i], &doms_cur[j]) | 7445 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
9122 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 7446 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
9123 | goto match2; | 7447 | goto match2; |
9124 | } | 7448 | } |
9125 | /* no match - add a new doms_new */ | 7449 | /* no match - add a new doms_new */ |
9126 | __build_sched_domains(doms_new + i, | 7450 | __build_sched_domains(doms_new[i], |
9127 | dattr_new ? dattr_new + i : NULL); | 7451 | dattr_new ? dattr_new + i : NULL); |
9128 | match2: | 7452 | match2: |
9129 | ; | 7453 | ; |
9130 | } | 7454 | } |
9131 | 7455 | ||
9132 | /* Remember the new sched domains */ | 7456 | /* Remember the new sched domains */ |
9133 | if (doms_cur != fallback_doms) | 7457 | if (doms_cur != &fallback_doms) |
9134 | kfree(doms_cur); | 7458 | free_sched_domains(doms_cur, ndoms_cur); |
9135 | kfree(dattr_cur); /* kfree(NULL) is safe */ | 7459 | kfree(dattr_cur); /* kfree(NULL) is safe */ |
9136 | doms_cur = doms_new; | 7460 | doms_cur = doms_new; |
9137 | dattr_cur = dattr_new; | 7461 | dattr_cur = dattr_new; |
@@ -9183,11 +7507,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
9183 | 7507 | ||
9184 | #ifdef CONFIG_SCHED_MC | 7508 | #ifdef CONFIG_SCHED_MC |
9185 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 7509 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
7510 | struct sysdev_class_attribute *attr, | ||
9186 | char *page) | 7511 | char *page) |
9187 | { | 7512 | { |
9188 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7513 | return sprintf(page, "%u\n", sched_mc_power_savings); |
9189 | } | 7514 | } |
9190 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 7515 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
7516 | struct sysdev_class_attribute *attr, | ||
9191 | const char *buf, size_t count) | 7517 | const char *buf, size_t count) |
9192 | { | 7518 | { |
9193 | return sched_power_savings_store(buf, count, 0); | 7519 | return sched_power_savings_store(buf, count, 0); |
@@ -9199,11 +7525,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | |||
9199 | 7525 | ||
9200 | #ifdef CONFIG_SCHED_SMT | 7526 | #ifdef CONFIG_SCHED_SMT |
9201 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 7527 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
7528 | struct sysdev_class_attribute *attr, | ||
9202 | char *page) | 7529 | char *page) |
9203 | { | 7530 | { |
9204 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7531 | return sprintf(page, "%u\n", sched_smt_power_savings); |
9205 | } | 7532 | } |
9206 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 7533 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
7534 | struct sysdev_class_attribute *attr, | ||
9207 | const char *buf, size_t count) | 7535 | const char *buf, size_t count) |
9208 | { | 7536 | { |
9209 | return sched_power_savings_store(buf, count, 1); | 7537 | return sched_power_savings_store(buf, count, 1); |
@@ -9242,8 +7570,10 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
9242 | switch (action) { | 7570 | switch (action) { |
9243 | case CPU_ONLINE: | 7571 | case CPU_ONLINE: |
9244 | case CPU_ONLINE_FROZEN: | 7572 | case CPU_ONLINE_FROZEN: |
9245 | case CPU_DEAD: | 7573 | case CPU_DOWN_PREPARE: |
9246 | case CPU_DEAD_FROZEN: | 7574 | case CPU_DOWN_PREPARE_FROZEN: |
7575 | case CPU_DOWN_FAILED: | ||
7576 | case CPU_DOWN_FAILED_FROZEN: | ||
9247 | partition_sched_domains(1, NULL, NULL); | 7577 | partition_sched_domains(1, NULL, NULL); |
9248 | return NOTIFY_OK; | 7578 | return NOTIFY_OK; |
9249 | 7579 | ||
@@ -9290,7 +7620,7 @@ void __init sched_init_smp(void) | |||
9290 | #endif | 7620 | #endif |
9291 | get_online_cpus(); | 7621 | get_online_cpus(); |
9292 | mutex_lock(&sched_domains_mutex); | 7622 | mutex_lock(&sched_domains_mutex); |
9293 | arch_init_sched_domains(cpu_online_mask); | 7623 | arch_init_sched_domains(cpu_active_mask); |
9294 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7624 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
9295 | if (cpumask_empty(non_isolated_cpus)) | 7625 | if (cpumask_empty(non_isolated_cpus)) |
9296 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7626 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -9363,13 +7693,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
9363 | #ifdef CONFIG_SMP | 7693 | #ifdef CONFIG_SMP |
9364 | rt_rq->rt_nr_migratory = 0; | 7694 | rt_rq->rt_nr_migratory = 0; |
9365 | rt_rq->overloaded = 0; | 7695 | rt_rq->overloaded = 0; |
9366 | plist_head_init(&rt_rq->pushable_tasks, &rq->lock); | 7696 | plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); |
9367 | #endif | 7697 | #endif |
9368 | 7698 | ||
9369 | rt_rq->rt_time = 0; | 7699 | rt_rq->rt_time = 0; |
9370 | rt_rq->rt_throttled = 0; | 7700 | rt_rq->rt_throttled = 0; |
9371 | rt_rq->rt_runtime = 0; | 7701 | rt_rq->rt_runtime = 0; |
9372 | spin_lock_init(&rt_rq->rt_runtime_lock); | 7702 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); |
9373 | 7703 | ||
9374 | #ifdef CONFIG_RT_GROUP_SCHED | 7704 | #ifdef CONFIG_RT_GROUP_SCHED |
9375 | rt_rq->rt_nr_boosted = 0; | 7705 | rt_rq->rt_nr_boosted = 0; |
@@ -9416,7 +7746,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
9416 | tg->rt_rq[cpu] = rt_rq; | 7746 | tg->rt_rq[cpu] = rt_rq; |
9417 | init_rt_rq(rt_rq, rq); | 7747 | init_rt_rq(rt_rq, rq); |
9418 | rt_rq->tg = tg; | 7748 | rt_rq->tg = tg; |
9419 | rt_rq->rt_se = rt_se; | ||
9420 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7749 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
9421 | if (add) | 7750 | if (add) |
9422 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7751 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
@@ -9447,16 +7776,9 @@ void __init sched_init(void) | |||
9447 | #ifdef CONFIG_RT_GROUP_SCHED | 7776 | #ifdef CONFIG_RT_GROUP_SCHED |
9448 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7777 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
9449 | #endif | 7778 | #endif |
9450 | #ifdef CONFIG_USER_SCHED | ||
9451 | alloc_size *= 2; | ||
9452 | #endif | ||
9453 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7779 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9454 | alloc_size += num_possible_cpus() * cpumask_size(); | 7780 | alloc_size += num_possible_cpus() * cpumask_size(); |
9455 | #endif | 7781 | #endif |
9456 | /* | ||
9457 | * As sched_init() is called before page_alloc is setup, | ||
9458 | * we use alloc_bootmem(). | ||
9459 | */ | ||
9460 | if (alloc_size) { | 7782 | if (alloc_size) { |
9461 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7783 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
9462 | 7784 | ||
@@ -9467,13 +7789,6 @@ void __init sched_init(void) | |||
9467 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7789 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; |
9468 | ptr += nr_cpu_ids * sizeof(void **); | 7790 | ptr += nr_cpu_ids * sizeof(void **); |
9469 | 7791 | ||
9470 | #ifdef CONFIG_USER_SCHED | ||
9471 | root_task_group.se = (struct sched_entity **)ptr; | ||
9472 | ptr += nr_cpu_ids * sizeof(void **); | ||
9473 | |||
9474 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
9475 | ptr += nr_cpu_ids * sizeof(void **); | ||
9476 | #endif /* CONFIG_USER_SCHED */ | ||
9477 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7792 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9478 | #ifdef CONFIG_RT_GROUP_SCHED | 7793 | #ifdef CONFIG_RT_GROUP_SCHED |
9479 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7794 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
@@ -9482,13 +7797,6 @@ void __init sched_init(void) | |||
9482 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7797 | init_task_group.rt_rq = (struct rt_rq **)ptr; |
9483 | ptr += nr_cpu_ids * sizeof(void **); | 7798 | ptr += nr_cpu_ids * sizeof(void **); |
9484 | 7799 | ||
9485 | #ifdef CONFIG_USER_SCHED | ||
9486 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
9487 | ptr += nr_cpu_ids * sizeof(void **); | ||
9488 | |||
9489 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
9490 | ptr += nr_cpu_ids * sizeof(void **); | ||
9491 | #endif /* CONFIG_USER_SCHED */ | ||
9492 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7800 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9493 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7801 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9494 | for_each_possible_cpu(i) { | 7802 | for_each_possible_cpu(i) { |
@@ -9508,22 +7816,13 @@ void __init sched_init(void) | |||
9508 | #ifdef CONFIG_RT_GROUP_SCHED | 7816 | #ifdef CONFIG_RT_GROUP_SCHED |
9509 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7817 | init_rt_bandwidth(&init_task_group.rt_bandwidth, |
9510 | global_rt_period(), global_rt_runtime()); | 7818 | global_rt_period(), global_rt_runtime()); |
9511 | #ifdef CONFIG_USER_SCHED | ||
9512 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
9513 | global_rt_period(), RUNTIME_INF); | ||
9514 | #endif /* CONFIG_USER_SCHED */ | ||
9515 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7819 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9516 | 7820 | ||
9517 | #ifdef CONFIG_GROUP_SCHED | 7821 | #ifdef CONFIG_CGROUP_SCHED |
9518 | list_add(&init_task_group.list, &task_groups); | 7822 | list_add(&init_task_group.list, &task_groups); |
9519 | INIT_LIST_HEAD(&init_task_group.children); | 7823 | INIT_LIST_HEAD(&init_task_group.children); |
9520 | 7824 | ||
9521 | #ifdef CONFIG_USER_SCHED | 7825 | #endif /* CONFIG_CGROUP_SCHED */ |
9522 | INIT_LIST_HEAD(&root_task_group.children); | ||
9523 | init_task_group.parent = &root_task_group; | ||
9524 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
9525 | #endif /* CONFIG_USER_SCHED */ | ||
9526 | #endif /* CONFIG_GROUP_SCHED */ | ||
9527 | 7826 | ||
9528 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | 7827 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP |
9529 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | 7828 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), |
@@ -9533,7 +7832,7 @@ void __init sched_init(void) | |||
9533 | struct rq *rq; | 7832 | struct rq *rq; |
9534 | 7833 | ||
9535 | rq = cpu_rq(i); | 7834 | rq = cpu_rq(i); |
9536 | spin_lock_init(&rq->lock); | 7835 | raw_spin_lock_init(&rq->lock); |
9537 | rq->nr_running = 0; | 7836 | rq->nr_running = 0; |
9538 | rq->calc_load_active = 0; | 7837 | rq->calc_load_active = 0; |
9539 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7838 | rq->calc_load_update = jiffies + LOAD_FREQ; |
@@ -9563,25 +7862,6 @@ void __init sched_init(void) | |||
9563 | * directly in rq->cfs (i.e. init_task_group->se[] = NULL). | 7862 | * directly in rq->cfs (i.e. init_task_group->se[] = NULL). |
9564 | */ | 7863 | */ |
9565 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7864 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); |
9566 | #elif defined CONFIG_USER_SCHED | ||
9567 | root_task_group.shares = NICE_0_LOAD; | ||
9568 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
9569 | /* | ||
9570 | * In case of task-groups formed thr' the user id of tasks, | ||
9571 | * init_task_group represents tasks belonging to root user. | ||
9572 | * Hence it forms a sibling of all subsequent groups formed. | ||
9573 | * In this case, init_task_group gets only a fraction of overall | ||
9574 | * system cpu resource, based on the weight assigned to root | ||
9575 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
9576 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
9577 | * (init_tg_cfs_rq) and having one entity represent this group of | ||
9578 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
9579 | */ | ||
9580 | init_tg_cfs_entry(&init_task_group, | ||
9581 | &per_cpu(init_tg_cfs_rq, i), | ||
9582 | &per_cpu(init_sched_entity, i), i, 1, | ||
9583 | root_task_group.se[i]); | ||
9584 | |||
9585 | #endif | 7865 | #endif |
9586 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7866 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9587 | 7867 | ||
@@ -9590,12 +7870,6 @@ void __init sched_init(void) | |||
9590 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7870 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
9591 | #ifdef CONFIG_CGROUP_SCHED | 7871 | #ifdef CONFIG_CGROUP_SCHED |
9592 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | 7872 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); |
9593 | #elif defined CONFIG_USER_SCHED | ||
9594 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
9595 | init_tg_rt_entry(&init_task_group, | ||
9596 | &per_cpu(init_rt_rq, i), | ||
9597 | &per_cpu(init_sched_rt_entity, i), i, 1, | ||
9598 | root_task_group.rt_se[i]); | ||
9599 | #endif | 7873 | #endif |
9600 | #endif | 7874 | #endif |
9601 | 7875 | ||
@@ -9611,6 +7885,8 @@ void __init sched_init(void) | |||
9611 | rq->cpu = i; | 7885 | rq->cpu = i; |
9612 | rq->online = 0; | 7886 | rq->online = 0; |
9613 | rq->migration_thread = NULL; | 7887 | rq->migration_thread = NULL; |
7888 | rq->idle_stamp = 0; | ||
7889 | rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
9614 | INIT_LIST_HEAD(&rq->migration_queue); | 7890 | INIT_LIST_HEAD(&rq->migration_queue); |
9615 | rq_attach_root(rq, &def_root_domain); | 7891 | rq_attach_root(rq, &def_root_domain); |
9616 | #endif | 7892 | #endif |
@@ -9629,7 +7905,7 @@ void __init sched_init(void) | |||
9629 | #endif | 7905 | #endif |
9630 | 7906 | ||
9631 | #ifdef CONFIG_RT_MUTEXES | 7907 | #ifdef CONFIG_RT_MUTEXES |
9632 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 7908 | plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); |
9633 | #endif | 7909 | #endif |
9634 | 7910 | ||
9635 | /* | 7911 | /* |
@@ -9660,7 +7936,9 @@ void __init sched_init(void) | |||
9660 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 7936 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); |
9661 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 7937 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); |
9662 | #endif | 7938 | #endif |
9663 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7939 | /* May be allocated at isolcpus cmdline parse time */ |
7940 | if (cpu_isolated_map == NULL) | ||
7941 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | ||
9664 | #endif /* SMP */ | 7942 | #endif /* SMP */ |
9665 | 7943 | ||
9666 | perf_event_init(); | 7944 | perf_event_init(); |
@@ -9671,12 +7949,12 @@ void __init sched_init(void) | |||
9671 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 7949 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9672 | static inline int preempt_count_equals(int preempt_offset) | 7950 | static inline int preempt_count_equals(int preempt_offset) |
9673 | { | 7951 | { |
9674 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | 7952 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
9675 | 7953 | ||
9676 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 7954 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
9677 | } | 7955 | } |
9678 | 7956 | ||
9679 | void __might_sleep(char *file, int line, int preempt_offset) | 7957 | void __might_sleep(const char *file, int line, int preempt_offset) |
9680 | { | 7958 | { |
9681 | #ifdef in_atomic | 7959 | #ifdef in_atomic |
9682 | static unsigned long prev_jiffy; /* ratelimiting */ | 7960 | static unsigned long prev_jiffy; /* ratelimiting */ |
@@ -9752,13 +8030,13 @@ void normalize_rt_tasks(void) | |||
9752 | continue; | 8030 | continue; |
9753 | } | 8031 | } |
9754 | 8032 | ||
9755 | spin_lock(&p->pi_lock); | 8033 | raw_spin_lock(&p->pi_lock); |
9756 | rq = __task_rq_lock(p); | 8034 | rq = __task_rq_lock(p); |
9757 | 8035 | ||
9758 | normalize_task(rq, p); | 8036 | normalize_task(rq, p); |
9759 | 8037 | ||
9760 | __task_rq_unlock(rq); | 8038 | __task_rq_unlock(rq); |
9761 | spin_unlock(&p->pi_lock); | 8039 | raw_spin_unlock(&p->pi_lock); |
9762 | } while_each_thread(g, p); | 8040 | } while_each_thread(g, p); |
9763 | 8041 | ||
9764 | read_unlock_irqrestore(&tasklist_lock, flags); | 8042 | read_unlock_irqrestore(&tasklist_lock, flags); |
@@ -9854,13 +8132,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
9854 | se = kzalloc_node(sizeof(struct sched_entity), | 8132 | se = kzalloc_node(sizeof(struct sched_entity), |
9855 | GFP_KERNEL, cpu_to_node(i)); | 8133 | GFP_KERNEL, cpu_to_node(i)); |
9856 | if (!se) | 8134 | if (!se) |
9857 | goto err; | 8135 | goto err_free_rq; |
9858 | 8136 | ||
9859 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8137 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); |
9860 | } | 8138 | } |
9861 | 8139 | ||
9862 | return 1; | 8140 | return 1; |
9863 | 8141 | ||
8142 | err_free_rq: | ||
8143 | kfree(cfs_rq); | ||
9864 | err: | 8144 | err: |
9865 | return 0; | 8145 | return 0; |
9866 | } | 8146 | } |
@@ -9942,13 +8222,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
9942 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | 8222 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), |
9943 | GFP_KERNEL, cpu_to_node(i)); | 8223 | GFP_KERNEL, cpu_to_node(i)); |
9944 | if (!rt_se) | 8224 | if (!rt_se) |
9945 | goto err; | 8225 | goto err_free_rq; |
9946 | 8226 | ||
9947 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8227 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); |
9948 | } | 8228 | } |
9949 | 8229 | ||
9950 | return 1; | 8230 | return 1; |
9951 | 8231 | ||
8232 | err_free_rq: | ||
8233 | kfree(rt_rq); | ||
9952 | err: | 8234 | err: |
9953 | return 0; | 8235 | return 0; |
9954 | } | 8236 | } |
@@ -9983,7 +8265,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
9983 | } | 8265 | } |
9984 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8266 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9985 | 8267 | ||
9986 | #ifdef CONFIG_GROUP_SCHED | 8268 | #ifdef CONFIG_CGROUP_SCHED |
9987 | static void free_sched_group(struct task_group *tg) | 8269 | static void free_sched_group(struct task_group *tg) |
9988 | { | 8270 | { |
9989 | free_fair_sched_group(tg); | 8271 | free_fair_sched_group(tg); |
@@ -10082,17 +8364,17 @@ void sched_move_task(struct task_struct *tsk) | |||
10082 | 8364 | ||
10083 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8365 | #ifdef CONFIG_FAIR_GROUP_SCHED |
10084 | if (tsk->sched_class->moved_group) | 8366 | if (tsk->sched_class->moved_group) |
10085 | tsk->sched_class->moved_group(tsk); | 8367 | tsk->sched_class->moved_group(tsk, on_rq); |
10086 | #endif | 8368 | #endif |
10087 | 8369 | ||
10088 | if (unlikely(running)) | 8370 | if (unlikely(running)) |
10089 | tsk->sched_class->set_curr_task(rq); | 8371 | tsk->sched_class->set_curr_task(rq); |
10090 | if (on_rq) | 8372 | if (on_rq) |
10091 | enqueue_task(rq, tsk, 0); | 8373 | enqueue_task(rq, tsk, 0, false); |
10092 | 8374 | ||
10093 | task_rq_unlock(rq, &flags); | 8375 | task_rq_unlock(rq, &flags); |
10094 | } | 8376 | } |
10095 | #endif /* CONFIG_GROUP_SCHED */ | 8377 | #endif /* CONFIG_CGROUP_SCHED */ |
10096 | 8378 | ||
10097 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8379 | #ifdef CONFIG_FAIR_GROUP_SCHED |
10098 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | 8380 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
@@ -10117,9 +8399,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
10117 | struct rq *rq = cfs_rq->rq; | 8399 | struct rq *rq = cfs_rq->rq; |
10118 | unsigned long flags; | 8400 | unsigned long flags; |
10119 | 8401 | ||
10120 | spin_lock_irqsave(&rq->lock, flags); | 8402 | raw_spin_lock_irqsave(&rq->lock, flags); |
10121 | __set_se_shares(se, shares); | 8403 | __set_se_shares(se, shares); |
10122 | spin_unlock_irqrestore(&rq->lock, flags); | 8404 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
10123 | } | 8405 | } |
10124 | 8406 | ||
10125 | static DEFINE_MUTEX(shares_mutex); | 8407 | static DEFINE_MUTEX(shares_mutex); |
@@ -10234,13 +8516,6 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
10234 | runtime = d->rt_runtime; | 8516 | runtime = d->rt_runtime; |
10235 | } | 8517 | } |
10236 | 8518 | ||
10237 | #ifdef CONFIG_USER_SCHED | ||
10238 | if (tg == &root_task_group) { | ||
10239 | period = global_rt_period(); | ||
10240 | runtime = global_rt_runtime(); | ||
10241 | } | ||
10242 | #endif | ||
10243 | |||
10244 | /* | 8519 | /* |
10245 | * Cannot have more runtime than the period. | 8520 | * Cannot have more runtime than the period. |
10246 | */ | 8521 | */ |
@@ -10304,18 +8579,18 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
10304 | if (err) | 8579 | if (err) |
10305 | goto unlock; | 8580 | goto unlock; |
10306 | 8581 | ||
10307 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8582 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
10308 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8583 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
10309 | tg->rt_bandwidth.rt_runtime = rt_runtime; | 8584 | tg->rt_bandwidth.rt_runtime = rt_runtime; |
10310 | 8585 | ||
10311 | for_each_possible_cpu(i) { | 8586 | for_each_possible_cpu(i) { |
10312 | struct rt_rq *rt_rq = tg->rt_rq[i]; | 8587 | struct rt_rq *rt_rq = tg->rt_rq[i]; |
10313 | 8588 | ||
10314 | spin_lock(&rt_rq->rt_runtime_lock); | 8589 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
10315 | rt_rq->rt_runtime = rt_runtime; | 8590 | rt_rq->rt_runtime = rt_runtime; |
10316 | spin_unlock(&rt_rq->rt_runtime_lock); | 8591 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
10317 | } | 8592 | } |
10318 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8593 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
10319 | unlock: | 8594 | unlock: |
10320 | read_unlock(&tasklist_lock); | 8595 | read_unlock(&tasklist_lock); |
10321 | mutex_unlock(&rt_constraints_mutex); | 8596 | mutex_unlock(&rt_constraints_mutex); |
@@ -10420,15 +8695,15 @@ static int sched_rt_global_constraints(void) | |||
10420 | if (sysctl_sched_rt_runtime == 0) | 8695 | if (sysctl_sched_rt_runtime == 0) |
10421 | return -EBUSY; | 8696 | return -EBUSY; |
10422 | 8697 | ||
10423 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 8698 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
10424 | for_each_possible_cpu(i) { | 8699 | for_each_possible_cpu(i) { |
10425 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 8700 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
10426 | 8701 | ||
10427 | spin_lock(&rt_rq->rt_runtime_lock); | 8702 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
10428 | rt_rq->rt_runtime = global_rt_runtime(); | 8703 | rt_rq->rt_runtime = global_rt_runtime(); |
10429 | spin_unlock(&rt_rq->rt_runtime_lock); | 8704 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
10430 | } | 8705 | } |
10431 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 8706 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
10432 | 8707 | ||
10433 | return 0; | 8708 | return 0; |
10434 | } | 8709 | } |
@@ -10643,7 +8918,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
10643 | struct cpuacct { | 8918 | struct cpuacct { |
10644 | struct cgroup_subsys_state css; | 8919 | struct cgroup_subsys_state css; |
10645 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8920 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
10646 | u64 *cpuusage; | 8921 | u64 __percpu *cpuusage; |
10647 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8922 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
10648 | struct cpuacct *parent; | 8923 | struct cpuacct *parent; |
10649 | }; | 8924 | }; |
@@ -10719,9 +8994,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | |||
10719 | /* | 8994 | /* |
10720 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | 8995 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. |
10721 | */ | 8996 | */ |
10722 | spin_lock_irq(&cpu_rq(cpu)->lock); | 8997 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
10723 | data = *cpuusage; | 8998 | data = *cpuusage; |
10724 | spin_unlock_irq(&cpu_rq(cpu)->lock); | 8999 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); |
10725 | #else | 9000 | #else |
10726 | data = *cpuusage; | 9001 | data = *cpuusage; |
10727 | #endif | 9002 | #endif |
@@ -10737,9 +9012,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
10737 | /* | 9012 | /* |
10738 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | 9013 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. |
10739 | */ | 9014 | */ |
10740 | spin_lock_irq(&cpu_rq(cpu)->lock); | 9015 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
10741 | *cpuusage = val; | 9016 | *cpuusage = val; |
10742 | spin_unlock_irq(&cpu_rq(cpu)->lock); | 9017 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); |
10743 | #else | 9018 | #else |
10744 | *cpuusage = val; | 9019 | *cpuusage = val; |
10745 | #endif | 9020 | #endif |
@@ -10860,12 +9135,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
10860 | } | 9135 | } |
10861 | 9136 | ||
10862 | /* | 9137 | /* |
9138 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9139 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9140 | * percpu_counter_add with values large enough to always overflow the | ||
9141 | * per cpu batch limit, causing bad SMP scalability. | ||
9142 | * | ||
9143 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9144 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9145 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9146 | */ | ||
9147 | #ifdef CONFIG_SMP | ||
9148 | #define CPUACCT_BATCH \ | ||
9149 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9150 | #else | ||
9151 | #define CPUACCT_BATCH 0 | ||
9152 | #endif | ||
9153 | |||
9154 | /* | ||
10863 | * Charge the system/user time to the task's accounting group. | 9155 | * Charge the system/user time to the task's accounting group. |
10864 | */ | 9156 | */ |
10865 | static void cpuacct_update_stats(struct task_struct *tsk, | 9157 | static void cpuacct_update_stats(struct task_struct *tsk, |
10866 | enum cpuacct_stat_index idx, cputime_t val) | 9158 | enum cpuacct_stat_index idx, cputime_t val) |
10867 | { | 9159 | { |
10868 | struct cpuacct *ca; | 9160 | struct cpuacct *ca; |
9161 | int batch = CPUACCT_BATCH; | ||
10869 | 9162 | ||
10870 | if (unlikely(!cpuacct_subsys.active)) | 9163 | if (unlikely(!cpuacct_subsys.active)) |
10871 | return; | 9164 | return; |
@@ -10874,7 +9167,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, | |||
10874 | ca = task_ca(tsk); | 9167 | ca = task_ca(tsk); |
10875 | 9168 | ||
10876 | do { | 9169 | do { |
10877 | percpu_counter_add(&ca->cpustat[idx], val); | 9170 | __percpu_counter_add(&ca->cpustat[idx], val, batch); |
10878 | ca = ca->parent; | 9171 | ca = ca->parent; |
10879 | } while (ca); | 9172 | } while (ca); |
10880 | rcu_read_unlock(); | 9173 | rcu_read_unlock(); |
@@ -10973,9 +9266,9 @@ void synchronize_sched_expedited(void) | |||
10973 | init_completion(&req->done); | 9266 | init_completion(&req->done); |
10974 | req->task = NULL; | 9267 | req->task = NULL; |
10975 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | 9268 | req->dest_cpu = RCU_MIGRATION_NEED_QS; |
10976 | spin_lock_irqsave(&rq->lock, flags); | 9269 | raw_spin_lock_irqsave(&rq->lock, flags); |
10977 | list_add(&req->list, &rq->migration_queue); | 9270 | list_add(&req->list, &rq->migration_queue); |
10978 | spin_unlock_irqrestore(&rq->lock, flags); | 9271 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
10979 | wake_up_process(rq->migration_thread); | 9272 | wake_up_process(rq->migration_thread); |
10980 | } | 9273 | } |
10981 | for_each_online_cpu(cpu) { | 9274 | for_each_online_cpu(cpu) { |
@@ -10983,13 +9276,14 @@ void synchronize_sched_expedited(void) | |||
10983 | req = &per_cpu(rcu_migration_req, cpu); | 9276 | req = &per_cpu(rcu_migration_req, cpu); |
10984 | rq = cpu_rq(cpu); | 9277 | rq = cpu_rq(cpu); |
10985 | wait_for_completion(&req->done); | 9278 | wait_for_completion(&req->done); |
10986 | spin_lock_irqsave(&rq->lock, flags); | 9279 | raw_spin_lock_irqsave(&rq->lock, flags); |
10987 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | 9280 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) |
10988 | need_full_sync = 1; | 9281 | need_full_sync = 1; |
10989 | req->dest_cpu = RCU_MIGRATION_IDLE; | 9282 | req->dest_cpu = RCU_MIGRATION_IDLE; |
10990 | spin_unlock_irqrestore(&rq->lock, flags); | 9283 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
10991 | } | 9284 | } |
10992 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | 9285 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; |
9286 | synchronize_sched_expedited_count++; | ||
10993 | mutex_unlock(&rcu_sched_expedited_mutex); | 9287 | mutex_unlock(&rcu_sched_expedited_mutex); |
10994 | put_online_cpus(); | 9288 | put_online_cpus(); |
10995 | if (need_full_sync) | 9289 | if (need_full_sync) |